A Detailed Guide to PyTorch Deep Learning Fundamentals
1. Basic Mathematical Concepts
1.1 Scalars, Vectors, Matrices, and Tensors
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from mpl_toolkits.mplot3d import Axes3D
# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)
# 检查CUDA是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
#使用设备: cuda
# 1. 标量 (Scalar) - 0维张量
scalar = torch.tensor(3.14)
print(f"标量: {scalar}")
#标量: 3.140000104904175
print(f"标量形状: {scalar.shape}")
#标量形状: torch.Size([])
print(f"标量维度: {scalar.dim()}")
#标量维度: 0
# 2. 向量 (Vector) - 1维张量
vector = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(f"\n向量: {vector}")
#向量: tensor([1., 2., 3., 4.])
print(f"向量形状: {vector.shape}")
#向量形状: torch.Size([4])
print(f"向量维度: {vector.dim()}")
#向量维度: 1
# 3. 矩阵 (Matrix) - 2维张量
matrix = torch.tensor([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]])
print(f"\n矩阵:\n{matrix}")
#矩阵:
#tensor([[1, 2, 3],
# [4, 5, 6],
# [7, 8, 9]])
print(f"矩阵形状: {matrix.shape}")
#矩阵形状: torch.Size([3, 3])
print(f"矩阵维度: {matrix.dim()}")
#矩阵维度: 2
# 4. 张量 (Tensor) - 多维数组
tensor_3d = torch.randn(2, 3, 4) # 2个3x4的矩阵
print(f"\n3维张量形状: {tensor_3d.shape}")
#3维张量形状: torch.Size([2, 3, 4])
print(f"3维张量维度: {tensor_3d.dim()}")
#3维张量维度: 3
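Higher-dimensional tensors usually carry a conventional meaning per axis; for images, PyTorch typically uses (batch, channels, height, width). A minimal sketch (the sizes below are made up for illustration):
# 4-D tensors commonly hold image batches: (batch, channels, height, width).
# These sizes are illustrative only.
images = torch.randn(8, 3, 32, 32)
print(images.dim())    # 4
print(images.shape)    # torch.Size([8, 3, 32, 32])
print(images.numel())  # 8 * 3 * 32 * 32 = 24576 elements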
1.2 Tensor Visualization
# 可视化不同维度的张量
fig = plt.figure(figsize=(15, 4))
# 标量可视化
ax1 = fig.add_subplot(141)
ax1.text(0.5, 0.5, '3.14', fontsize=20, ha='center', va='center')
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 1)
ax1.set_title('标量 (0D)')
ax1.axis('off')
# 向量可视化
ax2 = fig.add_subplot(142)
vector_data = [1, 2, 3, 4]
ax2.bar(range(len(vector_data)), vector_data, color='blue', alpha=0.7)
ax2.set_title('向量 (1D)')
ax2.set_xlabel('索引')
ax2.set_ylabel('值')
# 矩阵可视化
ax3 = fig.add_subplot(143)
matrix_data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
im = ax3.imshow(matrix_data, cmap='viridis', aspect='auto')
ax3.set_title('矩阵 (2D)')
for i in range(3):
    for j in range(3):
        ax3.text(j, i, str(matrix_data[i, j]), ha='center', va='center', color='white')
plt.colorbar(im, ax=ax3)
# 3D张量可视化(显示为多个2D切片)
ax4 = fig.add_subplot(144, projection='3d')
tensor_3d_data = np.random.randn(3, 3, 3)
x, y, z = np.meshgrid(range(3), range(3), range(3))
ax4.scatter(x, y, z, c=tensor_3d_data.flatten(), cmap='coolwarm', s=100, alpha=0.6)
ax4.set_title('3D张量')
ax4.set_xlabel('X')
ax4.set_ylabel('Y')
ax4.set_zlabel('Z')
plt.tight_layout()
plt.show()
2. PyTorch Tensor Basics
2.1 Creating and Manipulating Tensors
# 创建张量的多种方式
# 1. 从Python列表创建
tensor_from_list = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
# 2. 创建特定形状的张量
zeros_tensor = torch.zeros(3, 4) # 3x4的零张量
ones_tensor = torch.ones(2, 3) # 2x3的全1张量
random_tensor = torch.randn(3, 3) # 3x3的随机张量(标准正态分布)
# 3. 创建等差数列
arange_tensor = torch.arange(0, 10, 2) # [0, 2, 4, 6, 8]
linspace_tensor = torch.linspace(0, 1, 5) # 5个均匀分布的点
print("从列表创建的张量:")
#从列表创建的张量:
print(tensor_from_list)
#tensor([[1., 2.],
# [3., 4.]])
print(f"数据类型: {tensor_from_list.dtype}")
#数据类型: torch.float32
print(f"设备: {tensor_from_list.device}")
#设备: cpu
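Two other creation patterns worth knowing are the *_like constructors, which copy shape and dtype from an existing tensor, and NumPy interoperability, where torch.from_numpy shares memory with the source array. A minimal sketch (variable names are illustrative):
template = torch.randn(2, 3)
zeros_like_t = torch.zeros_like(template)   # same shape and dtype as template
randn_like_t = torch.randn_like(template)

np_array = np.array([1.0, 2.0, 3.0])
shared = torch.from_numpy(np_array)         # shares memory with np_array (CPU only)
np_array[0] = 100.0
print(shared)                               # tensor([100., 2., 3.], dtype=torch.float64)
back = shared.numpy()                       # .numpy() also shares memory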
2.2 Basic Tensor Operations
# 创建两个张量
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device=device)
print(f"矩阵a:\n{a}")
#矩阵a:
#tensor([[1., 2.],
# [3., 4.]], device='cuda:0')
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]], device=device)
print(f"矩阵b:\n{b}")
#矩阵b:
#tensor([[5., 6.],
# [7., 8.]], device='cuda:0')
# 基本算术运算
add_result = a + b # 逐元素加法
sub_result = a - b # 逐元素减法
mul_result = a * b # 逐元素乘法
div_result = a / b # 逐元素除法
#print(f"逐元素乘法 a * b:\n{mul_result}")
#逐元素乘法 a * b:
#tensor([[ 5., 12.],
# [21., 32.]], device='cuda:0')
# 矩阵运算
matmul_result = torch.matmul(a, b) # 矩阵乘法
# 或者使用 @ 操作符
matmul_result2 = a @ b
print(f"矩阵乘法 a @ b:\n{matmul_result}")
#矩阵乘法 a @ b:
#tensor([[19., 22.],
# [43., 50.]], device='cuda:0')
# 转置
transpose_result = a.T # 或 a.transpose(0, 1)
# 聚合操作
sum_result = a.sum() # 所有元素求和
mean_result = a.mean() # 平均值
max_result = a.max() # 最大值
min_result = a.min() # 最小值
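The aggregations above reduce over all elements; passing dim reduces along one axis, and keepdim=True keeps that axis with size 1 so the result still broadcasts against the original. A quick sketch reusing the matrix a defined above:
col_sum = a.sum(dim=0)                  # collapse rows    -> tensor([4., 6.])
row_sum = a.sum(dim=1)                  # collapse columns -> tensor([3., 7.])
row_mean = a.mean(dim=1, keepdim=True)  # shape (2, 1), ready for broadcasting
max_vals, max_idx = a.max(dim=1)        # per-row maximum and its index
print(col_sum, row_sum, row_mean.shape, max_idx)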
2.3 Tensor Shape Operations
# 创建一个张量
x = torch.randn(4, 3, 2)
print(f"原始形状: {x.shape}")
#原始形状: torch.Size([4, 3, 2])
# reshape操作
x_reshaped = x.reshape(6, 4) # 必须保证元素总数不变
print(f"reshape后: {x_reshaped.shape}")
#reshape后: torch.Size([6, 4])
# view操作(与reshape类似,但要求张量在内存中连续)
x_viewed = x.view(12, 2)
print(f"view后: {x_viewed.shape}")
#view后: torch.Size([12, 2])
# squeeze和unsqueeze
y = torch.randn(1, 3, 1, 4)
y_squeezed = y.squeeze() # 移除所有维度为1的维度
print(f"squeeze前: {y.shape}, squeeze后: {y_squeezed.shape}")
#squeeze前: torch.Size([1, 3, 1, 4]), squeeze后: torch.Size([3, 4])
y_unsqueezed = y_squeezed.unsqueeze(0) # 在第0维添加一个维度
print(f"unsqueeze后: {y_unsqueezed.shape}")
#unsqueeze后: torch.Size([1, 3, 4])
# 广播机制示例
a = torch.randn(3, 1)
b = torch.randn(1, 4)
c = a + b # 自动广播到 (3, 4)
print(f"广播结果形状: {c.shape}")
#广播结果形状: torch.Size([3, 4])
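Broadcasting aligns shapes from the trailing dimension backwards: each pair of sizes must either match or one of them must be 1 (missing leading dimensions count as 1). A small sketch of one case that works and one that fails:
ok = torch.randn(2, 3, 1) + torch.randn(3, 5)   # broadcasts to shape (2, 3, 5)
print(ok.shape)
try:
    bad = torch.randn(3, 2) + torch.randn(4)    # trailing sizes 2 vs 4 are incompatible
except RuntimeError as e:
    print(f"broadcast error: {e}")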
3. Computational Graphs and Automatic Differentiation
3.1 The Computational Graph Concept
A computational graph is a directed acyclic graph (DAG) that represents the operations and the flow of data in a computation.
# 创建需要梯度的张量
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
# 构建计算图
z = x * y
w = z ** 2
loss = w + 5
print(f"x: {x}, requires_grad={x.requires_grad}")
#x: 2.0, requires_grad=True
print(f"y: {y}, requires_grad={y.requires_grad}")
#y: 3.0, requires_grad=True
print(f"z = x * y: {z}")
#z = x * y: 6.0
print(f"w = z^2: {w}")
#w = z^2: 36.0
print(f"loss = w + 5: {loss}")
#loss = w + 5: 41.0
# 查看计算图信息
print(f"\nloss的grad_fn: {loss.grad_fn}")
#loss的grad_fn: <AddBackward0 object at 0x0000024315868310>
print(f"w的grad_fn: {w.grad_fn}")
#w的grad_fn: <PowBackward0 object at 0x00000243136D70D0>
print(f"z的grad_fn: {z.grad_fn}")
#z的grad_fn: <MulBackward0 object at 0x0000024315868310>
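Not every tensor joins the graph: operations executed under torch.no_grad(), or on tensors returned by .detach(), are not recorded and therefore have no grad_fn. A minimal sketch:
a = torch.tensor(2.0, requires_grad=True)
b = a * 3
print(b.requires_grad, b.grad_fn)   # True, <MulBackward0 ...>
with torch.no_grad():
    c = a * 3                       # not recorded in the graph
print(c.requires_grad, c.grad_fn)   # False, None
d = b.detach()                      # same value, but cut off from the graph
print(d.requires_grad, d.grad_fn)   # False, None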
3.2 Visualizing the Computational Graph
import networkx as nx
def visualize_computation_graph():
    """可视化简单的计算图"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # 前向传播图
    G_forward = nx.DiGraph()
    G_forward.add_edges_from([
        ('x', 'z = x * y'),
        ('y', 'z = x * y'),
        ('z = x * y', 'w = z²'),
        ('w = z²', 'loss = w + 5'),
        ('5', 'loss = w + 5')
    ])
    pos_forward = {
        'x': (0, 2),
        'y': (0, 0),
        'z = x * y': (2, 1),
        'w = z²': (4, 1),
        '5': (4, 0),
        'loss = w + 5': (6, 1)
    }
    nx.draw(G_forward, pos_forward, ax=ax1, with_labels=True,
            node_color='lightblue', node_size=2000, font_size=10,
            arrows=True, arrowsize=20, edge_color='gray')
    ax1.set_title('前向传播', fontsize=14)
    # 反向传播图
    G_backward = nx.DiGraph()
    G_backward.add_edges_from([
        ('∂loss/∂loss = 1', '∂loss/∂w'),
        ('∂loss/∂w', '∂loss/∂z'),
        ('∂loss/∂z', '∂loss/∂x'),
        ('∂loss/∂z', '∂loss/∂y')
    ])
    pos_backward = {
        '∂loss/∂loss = 1': (6, 1),
        '∂loss/∂w': (4, 1),
        '∂loss/∂z': (2, 1),
        '∂loss/∂x': (0, 2),
        '∂loss/∂y': (0, 0)
    }
    nx.draw(G_backward, pos_backward, ax=ax2, with_labels=True,
            node_color='lightcoral', node_size=2000, font_size=10,
            arrows=True, arrowsize=20, edge_color='gray')
    ax2.set_title('反向传播', fontsize=14)
    plt.tight_layout()
    plt.show()
visualize_computation_graph()
3.3 The Autograd Mechanism
# PyTorch的autograd自动求导
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
# 前向传播
z = x * y
w = z ** 2
loss = w + 5
# 反向传播
loss.backward()
# 查看梯度
print(f"∂loss/∂x = {x.grad}")
print(f"∂loss/∂y = {y.grad}")
# 手动验证梯度计算
# loss = (x*y)^2 + 5
# ∂loss/∂x = 2*x*y^2 = 2*2*9 = 36
# ∂loss/∂y = 2*x^2*y = 2*4*3 = 24
print(f"\n手动计算验证:")
print(f"∂loss/∂x = 2*x*y^2 = 2*{x.item()}*{y.item()}^2 = {2*x.item()*y.item()**2}")
print(f"∂loss/∂y = 2*x^2*y = 2*{x.item()}^2*{y.item()} = {2*x.item()**2*y.item()}")
∂loss/∂x = 36.0
∂loss/∂y = 24.0
手动计算验证:
∂loss/∂x = 2*x*y^2 = 2*2.0*3.0^2 = 36.0
∂loss/∂y = 2*x^2*y = 2*2.0^2*3.0 = 24.0
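Besides .backward(), torch.autograd.grad returns the gradients directly as a tuple without writing them into .grad, which is convenient when you only need them once. A minimal sketch using the same function as above:
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
loss = (x * y) ** 2 + 5
grad_x, grad_y = torch.autograd.grad(loss, (x, y))  # does not touch x.grad / y.grad
print(grad_x, grad_y)                               # tensor(36.), tensor(24.)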
3.4 Gradient Accumulation and Zeroing
# 梯度累积示例
x = torch.tensor(2.0, requires_grad=True)
# 第一次计算
y1 = x ** 2
y1.backward()
print(f"第一次backward后,x.grad = {x.grad}")
# 第二次计算(梯度会累积)
y2 = x ** 3
y2.backward()
print(f"第二次backward后(累积),x.grad = {x.grad}")
# 清零梯度
x.grad.zero_()
y3 = x ** 4
y3.backward()
print(f"清零后再次backward,x.grad = {x.grad}")
import torch
# 演示梯度累积的详细过程
x = torch.tensor(2.0, requires_grad=True)
print("=== 第一次backward ===")
y1 = x ** 2 # y1 = x²
print(f"y1 = {y1}")
print(f"dy1/dx = 2x = 2*{x} = {2*x}")
y1.backward()
print(f"第一次backward后,x.grad = {x.grad}")
print("\n=== 第二次backward(累积)===")
y2 = x ** 2 # y2 = x²
print(f"y2 = {y2}")
print(f"dy2/dx = 2x = 2*{x} = {2*x}")
y2.backward()
print(f"新梯度 = {2*x}")
print(f"累积梯度 = 原梯度 + 新梯度 = 4.0 + 4.0 = {x.grad}")
print("\n=== 清零后重新计算 ===")
x.grad.zero_() # 手动清零
y3 = x ** 4 # y3 = x⁴
print(f"y3 = {y3}")
print(f"dy3/dx = 4x³ = 4*{x}³ = 4*8 = {4 * (x**3)}")
y3.backward()
print(f"清零后的梯度 = {x.grad}")
=== 第一次backward ===
y1 = 4.0
dy1/dx = 2x = 2*2.0 = 4.0
第一次backward后,x.grad = 4.0
=== 第二次backward(累积)===
y2 = 4.0
dy2/dx = 2x = 2*2.0 = 4.0
新梯度 = 4.0
累积梯度 = 原梯度 + 新梯度 = 4.0 + 4.0 = 8.0
=== 清零后重新计算 ===
y3 = 16.0
dy3/dx = 4x³ = 4*2.0³ = 4*8 = 32.0
清零后的梯度 = 32.0
import torch
import torch.nn as nn
# 场景1: 模拟大批量训练(内存限制)
def simulate_large_batch_training():
    """模拟大批量训练:将大批量分解为小批量累积"""
    model = nn.Linear(10, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    # 模拟数据:假设我们想要批量大小为16,但内存只能处理4个样本
    total_batch_size = 16
    mini_batch_size = 4
    accumulation_steps = total_batch_size // mini_batch_size
    print(f"总批量大小: {total_batch_size}")
    print(f"小批量大小: {mini_batch_size}")
    print(f"累积步数: {accumulation_steps}")
    # 生成模拟数据
    all_data = torch.randn(total_batch_size, 10)
    all_targets = torch.randn(total_batch_size, 1)
    optimizer.zero_grad()  # 清零梯度
    total_loss = 0
    for step in range(accumulation_steps):
        # 获取小批量数据
        start_idx = step * mini_batch_size
        end_idx = start_idx + mini_batch_size
        mini_batch_data = all_data[start_idx:end_idx]
        mini_batch_targets = all_targets[start_idx:end_idx]
        # 前向传播
        outputs = model(mini_batch_data)
        loss = nn.MSELoss()(outputs, mini_batch_targets)
        # 反向传播(梯度累积)
        loss.backward()  # 梯度会自动累积
        total_loss += loss.item()
        print(f"步骤 {step+1}: loss = {loss.item():.4f}")
        # 查看参数的梯度(累积过程)
        if hasattr(model.weight, 'grad') and model.weight.grad is not None:
            grad_norm = model.weight.grad.norm().item()
            print(f" 累积梯度范数: {grad_norm:.4f}")
    # 平均梯度(模拟大批量的效果)
    for param in model.parameters():
        if param.grad is not None:
            param.grad /= accumulation_steps
    # 更新参数
    optimizer.step()
    avg_loss = total_loss / accumulation_steps
    print(f"\n平均损失: {avg_loss:.4f}")
    print("参数更新完成")
    return model

# 场景2: 多任务学习中的梯度累积
def multi_task_gradient_accumulation():
    """多任务学习:来自不同任务的梯度累积"""
    shared_model = nn.Linear(10, 5)
    task1_head = nn.Linear(5, 1)
    task2_head = nn.Linear(5, 2)
    optimizer = torch.optim.Adam(
        list(shared_model.parameters()) +
        list(task1_head.parameters()) +
        list(task2_head.parameters()),
        lr=0.001
    )
    # 模拟数据
    x = torch.randn(8, 10)
    y1 = torch.randn(8, 1)  # 任务1目标
    y2 = torch.randn(8, 2)  # 任务2目标
    optimizer.zero_grad()
    # 共享特征提取
    shared_features = shared_model(x)
    # 任务1的损失和梯度
    print("=== 任务1 ===")
    output1 = task1_head(shared_features)
    loss1 = nn.MSELoss()(output1, y1)
    print(f"任务1损失: {loss1.item():.4f}")
    loss1.backward(retain_graph=True)  # 保留计算图用于任务2
    # 检查共享层的梯度
    if shared_model.weight.grad is not None:
        grad_norm_after_task1 = shared_model.weight.grad.norm().item()
        print(f"任务1后共享层梯度范数: {grad_norm_after_task1:.4f}")
    # 任务2的损失和梯度(累积到任务1的梯度上)
    print("\n=== 任务2 ===")
    output2 = task2_head(shared_features)
    loss2 = nn.MSELoss()(output2, y2)
    print(f"任务2损失: {loss2.item():.4f}")
    loss2.backward()  # 梯度会累积到共享层
    # 检查累积后的梯度
    if shared_model.weight.grad is not None:
        grad_norm_after_both = shared_model.weight.grad.norm().item()
        print(f"两个任务后共享层梯度范数: {grad_norm_after_both:.4f}")
        print(f"梯度增加了: {grad_norm_after_both - grad_norm_after_task1:.4f}")
    # 更新所有参数
    optimizer.step()
    print("\n多任务梯度累积完成,参数已更新")

# 场景3: 演示为什么需要手动清零梯度
def why_zero_grad_needed():
    """演示不清零梯度的问题"""
    x = torch.tensor([1.0, 2.0], requires_grad=True)
    print("=== 不清零梯度的问题 ===")
    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}:")
        # 计算损失
        y = (x ** 2).sum()
        print(f"损失: {y.item()}")
        print(f"理论梯度应该是: {2 * x}")
        # 反向传播
        y.backward()
        print(f"实际累积梯度: {x.grad}")
        if epoch == 0:
            print("第一次正确 ✓")
        else:
            print("梯度被错误累积了! ✗")
        # 注意:这里故意不清零梯度来演示问题
    print("\n=== 正确做法:每次清零梯度 ===")
    x.grad.zero_()  # 重置
    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}:")
        # 清零梯度(正确做法)
        if x.grad is not None:
            x.grad.zero_()
        # 计算损失
        y = (x ** 2).sum()
        print(f"损失: {y.item()}")
        print(f"理论梯度应该是: {2 * x}")
        # 反向传播
        y.backward()
        print(f"实际梯度: {x.grad}")
        print("正确! ✓")

if __name__ == "__main__":
    print("1. 模拟大批量训练")
    print("=" * 50)
    simulate_large_batch_training()
    print("\n\n2. 多任务学习梯度累积")
    print("=" * 50)
    multi_task_gradient_accumulation()
    print("\n\n3. 为什么需要手动清零梯度")
    print("=" * 50)
    why_zero_grad_needed()
1. 模拟大批量训练
==================================================
总批量大小: 16
小批量大小: 4
累积步数: 4
步骤 1: loss = 1.8221
累积梯度范数: 4.5243
步骤 2: loss = 2.0437
累积梯度范数: 6.4268
步骤 3: loss = 2.1350
累积梯度范数: 7.6952
步骤 4: loss = 1.1126
累积梯度范数: 7.5846
平均损失: 1.7784
参数更新完成
2. 多任务学习梯度累积
==================================================
=== 任务1 ===
任务1损失: 0.9983
任务1后共享层梯度范数: 1.3653
=== 任务2 ===
任务2损失: 1.3457
两个任务后共享层梯度范数: 1.7382
梯度增加了: 0.3728
多任务梯度累积完成,参数已更新
3. 为什么需要手动清零梯度
==================================================
=== 不清零梯度的问题 ===
Epoch 1:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([2., 4.])
第一次正确 ✓
Epoch 2:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([4., 8.])
梯度被错误累积了! ✗
Epoch 3:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([ 6., 12.])
梯度被错误累积了! ✗
=== 正确做法:每次清零梯度 ===
Epoch 1:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓
Epoch 2:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓
Epoch 3:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓
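In practice the accumulation pattern above is usually written more compactly inside the training loop: scale each mini-batch loss by the number of accumulation steps, call backward() every step, and only step/zero the optimizer once per effective batch. A minimal sketch (the model, data, and hyperparameters below are placeholders):
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
accumulation_steps = 4
# Placeholder "loader": a list of (inputs, targets) mini-batches
loader = [(torch.randn(4, 10), torch.randn(4, 1)) for _ in range(8)]

optimizer.zero_grad()
for step, (inputs, targets) in enumerate(loader):
    loss = criterion(model(inputs), targets)
    (loss / accumulation_steps).backward()   # scale so the accumulated gradient is an average
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                     # one update per effective (large) batch
        optimizer.zero_grad()                # start accumulating the next batch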
4. The Chain Rule and Backpropagation
4.1 The Chain Rule
def chain_rule_example():
    """演示链式法则"""
    # 定义函数: f(g(h(x)))
    # h(x) = x^2
    # g(h) = sin(h)
    # f(g) = exp(g)
    x = torch.tensor(0.5, requires_grad=True)
    # 前向传播
    h = x ** 2
    g = torch.sin(h)
    f = torch.exp(g)
    # 反向传播
    f.backward()
    print("链式法则示例:")
    print(f"x = {x.item():.4f}")
    print(f"h = x^2 = {h.item():.4f}")
    print(f"g = sin(h) = {g.item():.4f}")
    print(f"f = exp(g) = {f.item():.4f}")
    print(f"\n自动求导结果: df/dx = {x.grad.item():.4f}")
    # 手动计算验证
    df_dg = torch.exp(g)
    dg_dh = torch.cos(h)
    dh_dx = 2 * x
    df_dx_manual = df_dg * dg_dh * dh_dx
    print(f"手动计算结果: df/dx = {df_dx_manual.item():.4f}")
    # 可视化链式法则
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.text(0.1, 0.5, 'x', fontsize=20, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue"))
    ax.arrow(0.15, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    ax.text(0.3, 0.5, 'h=x²', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgreen"))
    ax.arrow(0.38, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    ax.text(0.53, 0.5, 'g=sin(h)', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow"))
    ax.arrow(0.63, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    ax.text(0.78, 0.5, 'f=exp(g)', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcoral"))
    # 反向传播箭头
    ax.arrow(0.78, 0.3, -0.15, 0, head_width=0.02, head_length=0.02,
             fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.7, 0.25, 'df/dg', fontsize=12, color='red')
    ax.arrow(0.53, 0.3, -0.15, 0, head_width=0.02, head_length=0.02,
             fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.45, 0.25, 'dg/dh', fontsize=12, color='red')
    ax.arrow(0.3, 0.3, -0.12, 0, head_width=0.02, head_length=0.02,
             fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.22, 0.25, 'dh/dx', fontsize=12, color='red')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.set_title('链式法则: df/dx = (df/dg) × (dg/dh) × (dh/dx)', fontsize=14)
    plt.show()
chain_rule_example()
4.2 The Backpropagation Algorithm
class SimpleNN:
    """手动实现简单神经网络的前向和反向传播"""
    def __init__(self, input_size, hidden_size, output_size):
        # 初始化权重和偏置
        self.W1 = torch.randn(input_size, hidden_size, requires_grad=True)
        self.b1 = torch.zeros(hidden_size, requires_grad=True)
        self.W2 = torch.randn(hidden_size, output_size, requires_grad=True)
        self.b2 = torch.zeros(output_size, requires_grad=True)

    def forward(self, x):
        """前向传播"""
        # 第一层
        self.z1 = x @ self.W1 + self.b1
        self.a1 = torch.relu(self.z1)
        # 第二层
        self.z2 = self.a1 @ self.W2 + self.b2
        self.output = torch.sigmoid(self.z2)
        return self.output

    def backward_manual(self, x, y, output):
        """手动实现反向传播(用于教学)"""
        m = x.shape[0]
        # 输出层梯度
        dz2 = output - y
        dW2 = (self.a1.T @ dz2) / m
        db2 = torch.sum(dz2, dim=0) / m
        # 隐藏层梯度
        da1 = dz2 @ self.W2.T
        dz1 = da1 * (self.z1 > 0).float()  # ReLU的导数
        dW1 = (x.T @ dz1) / m
        db1 = torch.sum(dz1, dim=0) / m
        return dW1, db1, dW2, db2
# 测试反向传播
torch.manual_seed(42)
model = SimpleNN(2, 3, 1)
# 生成样本数据
x = torch.randn(4, 2)
y = torch.tensor([[1.], [0.], [1.], [0.]])
# 前向传播
output = model.forward(x)
# 计算损失
loss = torch.mean((output - y) ** 2)
# PyTorch自动反向传播
loss.backward()
print("神经网络结构:")
print(f"输入层: 2个神经元")
print(f"隐藏层: 3个神经元 (ReLU激活)")
print(f"输出层: 1个神经元 (Sigmoid激活)")
print(f"\n损失值: {loss.item():.4f}")
print(f"\nW1的梯度形状: {model.W1.grad.shape}")
print(f"W2的梯度形状: {model.W2.grad.shape}")
神经网络结构:
输入层: 2个神经元
隐藏层: 3个神经元 (ReLU激活)
输出层: 1个神经元 (Sigmoid激活)
损失值: 0.4442
W1的梯度形状: torch.Size([2, 3])
W2的梯度形状: torch.Size([3, 1])
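backward_manual can be called on the same data to inspect the hand-derived gradients. Keep in mind that its dz2 = output - y shortcut is the simplification that arises for a sigmoid output paired with a cross-entropy loss, so these values are not expected to match the autograd gradients of the MSE loss used above exactly; a quick sketch:
# Hand-derived gradients; the dz2 = output - y shortcut assumes a
# cross-entropy loss on the sigmoid output, so the numbers need not
# match model.W1.grad / model.W2.grad computed from the MSE loss.
dW1, db1, dW2, db2 = model.backward_manual(x, y, output)
print(f"manual dW2:\n{dW2}")
print(f"autograd W2.grad:\n{model.W2.grad}")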
4.3 Vanishing and Exploding Gradients
def gradient_issues_demo():
    """演示梯度消失和梯度爆炸问题"""
    # 梯度消失示例(深层网络with sigmoid)
    x = torch.randn(1, 10, requires_grad=True)
    # 模拟深层网络
    h = x
    activations = [x.detach().numpy()]
    for i in range(10):
        h = torch.sigmoid(h)
        activations.append(h.detach().numpy())
    loss = h.sum()
    loss.backward()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    # 绘制激活值变化
    for i, act in enumerate(activations):
        ax1.plot(act.flatten(), label=f'Layer {i}', alpha=0.7)
    ax1.set_title('梯度消失:Sigmoid激活值逐层递减')
    ax1.set_xlabel('神经元索引')
    ax1.set_ylabel('激活值')
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    # 梯度爆炸示例
    x2 = torch.randn(1, 10, requires_grad=True)
    h2 = x2
    for i in range(5):
        h2 = h2 * 2.0  # 每层权重都大于1
    loss2 = h2.sum()
    loss2.backward()
    # 绘制梯度大小
    gradient_magnitudes = [x.grad.abs().mean().item(), x2.grad.abs().mean().item()]
    ax2.bar(['梯度消失\n(深层Sigmoid)', '梯度爆炸\n(权重>1)'],
            gradient_magnitudes, color=['blue', 'red'], alpha=0.7)
    ax2.set_ylabel('梯度绝对值均值')
    ax2.set_title('梯度问题对比')
    ax2.set_yscale('log')
    plt.tight_layout()
    plt.show()
    print(f"梯度消失情况下的梯度均值: {gradient_magnitudes[0]:.2e}")
    print(f"梯度爆炸情况下的梯度均值: {gradient_magnitudes[1]:.2e}")
gradient_issues_demo()
梯度消失情况下的梯度均值: 3.36e-07
梯度爆炸情况下的梯度均值: 3.20e+01
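A common mitigation for exploding gradients is to clip the global gradient norm after backward() and before optimizer.step(); torch.nn.utils.clip_grad_norm_ rescales all parameter gradients in place. A minimal sketch:
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
inputs, targets = torch.randn(4, 10), torch.randn(4, 1)

optimizer.zero_grad()
loss = nn.MSELoss()(model(inputs), targets)
loss.backward()
# Rescale gradients so their combined L2 norm is at most 1.0
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()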
5. The Jacobian Matrix
5.1 Definition of the Jacobian Matrix
The Jacobian matrix describes the derivative of a vector-valued function: it arranges the partial derivatives of a multivariate function into a matrix.
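Formally, for a function f: R^n → R^m with components f_1, …, f_m, the Jacobian is the m × n matrix of partial derivatives (written in LaTeX here for reference):
J = \frac{\partial f}{\partial x} =
\begin{bmatrix}
  \frac{\partial f_1}{\partial x_1} & \cdots & \frac{\partial f_1}{\partial x_n} \\
  \vdots & \ddots & \vdots \\
  \frac{\partial f_m}{\partial x_1} & \cdots & \frac{\partial f_m}{\partial x_n}
\end{bmatrix},
\qquad
J_{ij} = \frac{\partial f_i}{\partial x_j}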
def jacobian_example():
    """雅可比矩阵示例"""
    # 定义输入向量
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    # 定义向量函数 f: R^3 -> R^2
    # f1 = x1^2 + x2*x3
    # f2 = x1*x2 - x3^2
    f1 = x[0]**2 + x[1]*x[2]
    f2 = x[0]*x[1] - x[2]**2
    # 输出向量
    f = torch.stack([f1, f2])
    # 计算雅可比矩阵
    jacobian = torch.zeros(2, 3)
    for i in range(2):
        # 重置梯度
        if x.grad is not None:
            x.grad.zero_()
        # 对第i个输出计算梯度
        f[i].backward(retain_graph=True)
        jacobian[i] = x.grad.clone()
    print("雅可比矩阵:")
    print("J = [∂f/∂x] =")
    print(jacobian)
    # 手动验证
    print("\n手动计算验证:")
    print(f"∂f1/∂x1 = 2*x1 = {2*x[0].item()}")
    print(f"∂f1/∂x2 = x3 = {x[2].item()}")
    print(f"∂f1/∂x3 = x2 = {x[1].item()}")
    print(f"∂f2/∂x1 = x2 = {x[1].item()}")
    print(f"∂f2/∂x2 = x1 = {x[0].item()}")
    print(f"∂f2/∂x3 = -2*x3 = {-2*x[2].item()}")
    # 可视化雅可比矩阵
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(jacobian.detach().numpy(), cmap='RdBu', aspect='auto', vmin=-10, vmax=10)
    # 添加数值标签
    for i in range(2):
        for j in range(3):
            text = ax.text(j, i, f'{jacobian[i, j].item():.1f}',
                           ha="center", va="center", color="black", fontsize=14)
    ax.set_xticks([0, 1, 2])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['∂/∂x₁', '∂/∂x₂', '∂/∂x₃'])
    ax.set_yticklabels(['f₁', 'f₂'])
    ax.set_title('雅可比矩阵 J', fontsize=16)
    plt.colorbar(im, ax=ax)
    plt.tight_layout()
    plt.show()
jacobian_example()
雅可比矩阵:
J = [∂f/∂x] =
tensor([[ 2., 3., 2.],
[ 2., 1., -6.]])
手动计算验证:
∂f1/∂x1 = 2*x1 = 2.0
∂f1/∂x2 = x3 = 3.0
∂f1/∂x3 = x2 = 2.0
∂f2/∂x1 = x2 = 2.0
∂f2/∂x2 = x1 = 1.0
∂f2/∂x3 = -2*x3 = -6.0
5.2 Jacobian Matrices in Neural Networks
# 使用torch.autograd.functional计算雅可比矩阵
from torch.autograd.functional import jacobian
def network_jacobian():
    """神经网络层的雅可比矩阵"""
    # 定义一个简单的网络层
    def layer(x):
        W = torch.tensor([[1.0, -0.5],
                          [0.5, 2.0],
                          [-1.0, 1.0]])
        b = torch.tensor([0.1, 0.2, -0.1])
        return torch.relu(x @ W.T + b)
    # 输入
    x = torch.tensor([1.0, 2.0])
    # 计算雅可比矩阵
    J = jacobian(layer, x)
    print("网络层函数: f(x) = ReLU(Wx + b)")
    print(f"输入维度: {x.shape}")
    print(f"输出维度: {layer(x).shape}")
    print(f"\n雅可比矩阵形状: {J.shape}")
    print("雅可比矩阵:")
    print(J)
    # 可视化输入输出关系
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    # 绘制网络结构
    ax1.set_xlim(-0.5, 2.5)
    ax1.set_ylim(-0.5, 3.5)
    # 输入节点
    for i in range(2):
        circle = plt.Circle((0, i+0.5), 0.2, color='lightblue', ec='black')
        ax1.add_patch(circle)
        ax1.text(0, i+0.5, f'x{i+1}', ha='center', va='center')
    # 输出节点
    for i in range(3):
        circle = plt.Circle((2, i), 0.2, color='lightgreen', ec='black')
        ax1.add_patch(circle)
        ax1.text(2, i, f'y{i+1}', ha='center', va='center')
    # 连接线
    for i in range(2):
        for j in range(3):
            ax1.arrow(0.2, i+0.5, 1.6, j-i-0.5,
                      head_width=0.05, head_length=0.05,
                      fc='gray', ec='gray', alpha=0.5)
    ax1.set_title('网络层结构', fontsize=14)
    ax1.axis('off')
    # 绘制雅可比矩阵热图
    im = ax2.imshow(J.detach().numpy(), cmap='coolwarm', aspect='auto')
    ax2.set_xlabel('输入维度')
    ax2.set_ylabel('输出维度')
    ax2.set_title('雅可比矩阵热图', fontsize=14)
    plt.colorbar(im, ax=ax2)
    plt.tight_layout()
    plt.show()
network_jacobian()
网络层函数: f(x) = ReLU(Wx + b)
输入维度: torch.Size([2])
输出维度: torch.Size([3])
雅可比矩阵形状: torch.Size([3, 2])
雅可比矩阵:
tensor([[ 1.0000, -0.5000],
[ 0.5000, 2.0000],
[-1.0000, 1.0000]])
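For non-scalar outputs, backward() does not build the full Jacobian: passing a vector v to y.backward(v) computes the vector-Jacobian product Jᵀv and stores it in x.grad. A minimal sketch using the same weights as above (here every ReLU unit is active, so J equals W):
W = torch.tensor([[1.0, -0.5],
                  [0.5, 2.0],
                  [-1.0, 1.0]])
b = torch.tensor([0.1, 0.2, -0.1])
x = torch.tensor([1.0, 2.0], requires_grad=True)
y = torch.relu(x @ W.T + b)         # non-scalar output of shape (3,)
v = torch.tensor([1.0, 0.0, 0.0])   # selects the first output component
y.backward(v)                       # accumulates J^T v into x.grad
print(x.grad)                       # first row of the Jacobian: tensor([ 1.0000, -0.5000])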
6. GPU and CPU Compatibility
6.1 Device Management
# 检查CUDA可用性
print(f"CUDA是否可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA设备数量: {torch.cuda.device_count()}")
    print(f"当前CUDA设备: {torch.cuda.current_device()}")
    print(f"CUDA设备名称: {torch.cuda.get_device_name(0)}")
# 设备选择最佳实践
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n使用设备: {device}")
# 创建张量时指定设备
tensor_cpu = torch.randn(3, 3) # 默认在CPU上
tensor_gpu = torch.randn(3, 3, device=device) # 在指定设备上
print(f"\nCPU张量设备: {tensor_cpu.device}")
print(f"GPU/CPU张量设备: {tensor_gpu.device}")
CUDA是否可用: True
CUDA设备数量: 1
当前CUDA设备: 0
CUDA设备名称: NVIDIA GeForce RTX 3080
使用设备: cuda
CPU张量设备: cpu
GPU/CPU张量设备: cuda:0
6.2 Moving Data Between Devices
# CPU到GPU
if torch.cuda.is_available():
    # 方法1: 使用.to()
    tensor_cpu = torch.randn(2, 3)
    tensor_gpu = tensor_cpu.to('cuda')
    # 方法2: 使用.cuda()
    tensor_gpu2 = tensor_cpu.cuda()
    # GPU到CPU
    tensor_back_to_cpu = tensor_gpu.cpu()
    print(f"原始张量设备: {tensor_cpu.device}")
    print(f"转移到GPU后: {tensor_gpu.device}")
    print(f"转回CPU后: {tensor_back_to_cpu.device}")
else:
    print("CUDA不可用,使用CPU进行演示")
    tensor = torch.randn(2, 3)
    print(f"张量设备: {tensor.device}")
原始张量设备: cpu
转移到GPU后: cuda:0
转回CPU后: cpu
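Operations require all operands to live on the same device; mixing a CPU tensor with a CUDA tensor raises a RuntimeError. A small sketch:
if torch.cuda.is_available():
    cpu_t = torch.randn(2, 2)
    gpu_t = torch.randn(2, 2, device='cuda')
    try:
        _ = cpu_t + gpu_t                  # devices differ -> RuntimeError
    except RuntimeError as e:
        print(f"error: {e}")
    result = cpu_t.to('cuda') + gpu_t      # fix: move one operand first
    print(result.device)                   # cuda:0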
6.3 Writing Device-Agnostic Code
class DeviceAgnosticModel(torch.nn.Module):
    """设备无关的模型示例"""
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def train_on_device(model, data, target, device, epochs=100):
    """设备无关的训练函数"""
    # 将模型移到指定设备
    model = model.to(device)
    data = data.to(device)
    target = target.to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    criterion = torch.nn.MSELoss()
    losses = []
    for epoch in range(epochs):
        # 前向传播
        output = model(data)
        loss = criterion(output, target)
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if (epoch + 1) % 20 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
    return losses
# 使用示例
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"训练设备: {device}")
# 创建模型和数据
model = DeviceAgnosticModel(10, 20, 1)
data = torch.randn(100, 10)
target = torch.randn(100, 1)
# 训练
losses = train_on_device(model, data, target, device, epochs=100)
# 绘制损失曲线
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.title(f'训练损失曲线 (设备: {device})')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.show()
Epoch [20/100], Loss: 0.8893
Epoch [40/100], Loss: 0.8566
Epoch [60/100], Loss: 0.8442
Epoch [80/100], Loss: 0.8342
Epoch [100/100], Loss: 0.8252
6.4 Performance Optimization Tips
def performance_tips():
    """GPU/CPU性能优化技巧"""
    print("性能优化技巧:")
    print("-" * 50)
    # 1. 批量操作
    print("1. 使用批量操作而非循环:")
    # 不推荐
    start_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
    end_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
    x = torch.randn(1000, 100, device=device)
    result_slow = torch.zeros(1000, device=device)
    if torch.cuda.is_available():
        start_time.record()
    for i in range(1000):
        result_slow[i] = x[i].sum()
    if torch.cuda.is_available():
        end_time.record()
        torch.cuda.synchronize()
        print(f" 循环方式时间: {start_time.elapsed_time(end_time):.2f} ms")
    # 推荐
    if torch.cuda.is_available():
        start_time.record()
    result_fast = x.sum(dim=1)
    if torch.cuda.is_available():
        end_time.record()
        torch.cuda.synchronize()
        print(f" 批量操作时间: {start_time.elapsed_time(end_time):.2f} ms")
    # 2. 避免频繁的设备间数据传输
    print("\n2. 最小化CPU-GPU数据传输:")
    print(" - 在GPU上完成所有计算")
    print(" - 只在必要时传回CPU")
    # 3. 使用适当的数据类型
    print("\n3. 使用合适的数据类型:")
    tensor_float32 = torch.randn(1000, 1000, device=device)
    tensor_float16 = tensor_float32.half()  # 转换为半精度
    print(f" Float32内存: {tensor_float32.element_size() * tensor_float32.nelement() / 1024**2:.2f} MB")
    print(f" Float16内存: {tensor_float16.element_size() * tensor_float16.nelement() / 1024**2:.2f} MB")
    # 4. 固定内存
    print("\n4. 使用pin_memory加速数据加载:")
    if torch.cuda.is_available():
        # 创建DataLoader时使用pin_memory=True
        print(" DataLoader(..., pin_memory=True)")
performance_tips()
性能优化技巧:
--------------------------------------------------
1. 使用批量操作而非循环:
循环方式时间: 32.97 ms
批量操作时间: 0.03 ms
2. 最小化CPU-GPU数据传输:
- 在GPU上完成所有计算
- 只在必要时传回CPU
3. 使用合适的数据类型:
Float32内存: 3.81 MB
Float16内存: 1.91 MB
4. 使用pin_memory加速数据加载:
DataLoader(..., pin_memory=True)
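Tip 4 in DataLoader form: with pin_memory=True the host-side batches are allocated in page-locked memory, and the copy to the GPU can then be issued asynchronously with non_blocking=True. A minimal sketch (the dataset here is a placeholder):
from torch.utils.data import DataLoader, TensorDataset

toy_dataset = TensorDataset(torch.randn(1024, 10), torch.randn(1024, 1))
loader = DataLoader(toy_dataset, batch_size=64, shuffle=True,
                    pin_memory=torch.cuda.is_available())
for inputs, targets in loader:
    # Asynchronous host-to-device copies when the source tensor is pinned
    inputs = inputs.to(device, non_blocking=True)
    targets = targets.to(device, non_blocking=True)
    # ... forward / backward / optimizer.step() would go here ...
    break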
7. Complete Example: Building a Simple Neural Network
7.1 The Complete Training Pipeline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
# 设置随机种子
torch.manual_seed(42)
np.random.seed(42)
# 设备配置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
# 自定义数据集
class SpiralDataset(Dataset):
    """生成螺旋形分类数据"""
    def __init__(self, n_points=1000, n_classes=3, noise=0.2):
        self.n_points = n_points
        self.n_classes = n_classes
        X = []
        y = []
        for class_idx in range(n_classes):
            # 生成螺旋数据
            theta = np.linspace(class_idx * 4, (class_idx + 1) * 4, n_points // n_classes) + np.random.randn(n_points // n_classes) * noise
            r = np.linspace(0.5, 2, n_points // n_classes) + np.random.randn(n_points // n_classes) * noise * 0.1
            x1 = r * np.cos(theta)
            x2 = r * np.sin(theta)
            X.append(np.column_stack([x1, x2]))
            y.append(np.full(n_points // n_classes, class_idx))
        self.X = torch.FloatTensor(np.vstack(X))
        self.y = torch.LongTensor(np.hstack(y))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 定义神经网络模型
class NeuralNetwork(nn.Module):
    def __init__(self, input_size=2, hidden_sizes=[64, 32], output_size=3):
        super(NeuralNetwork, self).__init__()
        layers = []
        prev_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            prev_size = hidden_size
        layers.append(nn.Linear(prev_size, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

# 训练函数
def train_model(model, train_loader, val_loader, epochs=100, lr=0.01):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    for epoch in range(epochs):
        # 训练阶段
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            # 前向传播
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
        train_loss /= len(train_loader)
        train_acc = 100 * correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        # 验证阶段
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        val_loss /= len(val_loader)
        val_acc = 100 * correct / total
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        scheduler.step(val_loss)
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], '
                  f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
    return train_losses, val_losses, train_accs, val_accs
# 创建数据集
dataset = SpiralDataset(n_points=3000, n_classes=3, noise=0.2)
# 划分训练集和验证集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
# 创建模型
model = NeuralNetwork(input_size=2, hidden_sizes=[128, 64, 32], output_size=3)
print(f"\n模型结构:\n{model}")
# 训练模型
print("\n开始训练...")
train_losses, val_losses, train_accs, val_accs = train_model(
model, train_loader, val_loader, epochs=100, lr=0.001
)
# 可视化结果
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
# 绘制数据分布
ax = axes[0, 0]
colors = ['red', 'blue', 'green']
for i in range(3):
    mask = dataset.y == i
    ax.scatter(dataset.X[mask, 0], dataset.X[mask, 1],
               c=colors[i], alpha=0.6, label=f'Class {i}')
ax.set_title('原始数据分布')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.legend()
# 绘制决策边界
ax = axes[0, 1]
model.eval()
h = 0.02
x_min, x_max = dataset.X[:, 0].min() - 0.5, dataset.X[:, 0].max() + 0.5
y_min, y_max = dataset.X[:, 1].min() - 0.5, dataset.X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
with torch.no_grad():
    Z = model(torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()]).to(device))
    _, Z = torch.max(Z, 1)
    Z = Z.cpu().numpy().reshape(xx.shape)
ax.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
for i in range(3):
    mask = dataset.y == i
    ax.scatter(dataset.X[mask, 0], dataset.X[mask, 1],
               c=colors[i], alpha=0.6, label=f'Class {i}')
ax.set_title('学习到的决策边界')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.legend()
# 绘制训练损失
ax = axes[0, 2]
ax.plot(train_losses, label='训练损失', alpha=0.8)
ax.plot(val_losses, label='验证损失', alpha=0.8)
ax.set_title('损失曲线')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True, alpha=0.3)
# 绘制准确率
ax = axes[1, 0]
ax.plot(train_accs, label='训练准确率', alpha=0.8)
ax.plot(val_accs, label='验证准确率', alpha=0.8)
ax.set_title('准确率曲线')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy (%)')
ax.legend()
ax.grid(True, alpha=0.3)
# 绘制梯度流
ax = axes[1, 1]
gradients = []
for name, param in model.named_parameters():
    if param.grad is not None:
        gradients.append(param.grad.abs().mean().item())
ax.bar(range(len(gradients)), gradients, color='skyblue', alpha=0.7)
ax.set_title('各层梯度大小')
ax.set_xlabel('层索引')
ax.set_ylabel('梯度绝对值均值')
ax.grid(True, alpha=0.3)
# 绘制权重分布
ax = axes[1, 2]
weights = []
for param in model.parameters():
    if len(param.shape) > 1:  # 只考虑权重矩阵,不考虑偏置
        weights.extend(param.cpu().detach().numpy().flatten())
ax.hist(weights, bins=50, color='purple', alpha=0.7, edgecolor='black')
ax.set_title('权重分布')
ax.set_xlabel('权重值')
ax.set_ylabel('频数')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
使用设备: cuda
模型结构:
NeuralNetwork(
(network): Sequential(
(0): Linear(in_features=2, out_features=128, bias=True)
(1): ReLU()
(2): Dropout(p=0.2, inplace=False)
(3): Linear(in_features=128, out_features=64, bias=True)
(4): ReLU()
(5): Dropout(p=0.2, inplace=False)
(6): Linear(in_features=64, out_features=32, bias=True)
(7): ReLU()
(8): Dropout(p=0.2, inplace=False)
(9): Linear(in_features=32, out_features=3, bias=True)
)
)
开始训练...
Epoch [10/100], Train Loss: 0.0451, Train Acc: 98.88%, Val Loss: 0.0076, Val Acc: 99.83%
Epoch [20/100], Train Loss: 0.0203, Train Acc: 99.42%, Val Loss: 0.0011, Val Acc: 100.00%
Epoch [30/100], Train Loss: 0.0142, Train Acc: 99.50%, Val Loss: 0.0036, Val Acc: 99.83%
Epoch [40/100], Train Loss: 0.0097, Train Acc: 99.71%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [50/100], Train Loss: 0.0061, Train Acc: 99.83%, Val Loss: 0.0000, Val Acc: 100.00%
Epoch [60/100], Train Loss: 0.0088, Train Acc: 99.54%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [70/100], Train Loss: 0.0037, Train Acc: 99.83%, Val Loss: 0.0001, Val Acc: 100.00%
Epoch [80/100], Train Loss: 0.0063, Train Acc: 99.83%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [90/100], Train Loss: 0.0111, Train Acc: 99.88%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [100/100], Train Loss: 0.0048, Train Acc: 99.83%, Val Loss: 0.0002, Val Acc: 100.00%
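With training finished, the model can be used for inference on new points; remember model.eval() to disable dropout and torch.no_grad() to skip graph construction. A small sketch with a made-up query point:
model.eval()
new_point = torch.tensor([[1.0, 0.5]], device=device)   # an arbitrary (x1, x2) query
with torch.no_grad():
    probs = torch.softmax(model(new_point), dim=1)
    pred = probs.argmax(dim=1)
print(f"class probabilities: {probs.cpu().numpy()}")
print(f"predicted class: {pred.item()}")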
7.2 A Computational-Graph Inspection Utility
def visualize_computation_graph_detailed(model, input_size=(1, 2)):
    """详细可视化模型的计算图"""
    # 创建示例输入
    x = torch.randn(input_size, requires_grad=True)
    # 前向传播
    y = model(x)
    # 创建一个简单的损失
    loss = y.sum()
    # 执行反向传播以构建完整的计算图
    loss.backward()
    print("计算图构建完成")
    print(f"输入形状: {x.shape}")
    print(f"输出形状: {y.shape}")
    print(f"损失值: {loss.item():.4f}")
    # 打印梯度信息
    print("\n各层参数的梯度:")
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"{name}: 梯度形状={param.grad.shape}, "
                  f"梯度均值={param.grad.mean().item():.6f}, "
                  f"梯度标准差={param.grad.std().item():.6f}")
# 使用示例
small_model = NeuralNetwork(input_size=2, hidden_sizes=[4, 3], output_size=2)
visualize_computation_graph_detailed(small_model)
计算图构建完成
输入形状: torch.Size([1, 2])
输出形状: torch.Size([1, 2])
损失值: 0.8725
各层参数的梯度:
network.0.weight: 梯度形状=torch.Size([4, 2]), 梯度均值=0.000000, 梯度标准差=0.000000
network.0.bias: 梯度形状=torch.Size([4]), 梯度均值=0.000000, 梯度标准差=0.000000
network.3.weight: 梯度形状=torch.Size([3, 4]), 梯度均值=0.000000, 梯度标准差=0.000000
network.3.bias: 梯度形状=torch.Size([3]), 梯度均值=0.000000, 梯度标准差=0.000000
network.6.weight: 梯度形状=torch.Size([2, 3]), 梯度均值=0.000000, 梯度标准差=0.000000
network.6.bias: 梯度形状=torch.Size([2]), 梯度均值=1.000000, 梯度标准差=0.000000
Summary
- Tensor operations: PyTorch's fundamental data structure, supporting computation on both CPU and GPU
- Automatic differentiation: gradients are computed automatically via the computational graph, the core of deep learning
- Backpropagation: gradient computation based on the chain rule
- Device management: flexible switching between GPU and CPU for efficient computation
- Practical application: a complete deep learning workflow from theory to practice
Best Practices
- Always write device-agnostic code using a device variable
- Use requires_grad deliberately to control which tensors track gradients
- Be aware of gradient accumulation and zero gradients when needed
- Prefer batched operations over Python loops for performance
- Monitor gradient magnitudes to catch vanishing or exploding gradients
Next Steps
- Study how optimizers work in depth
- Explore more complex architectures (CNN, RNN, Transformer)
- Learn distributed training and mixed-precision training
- Master model deployment and optimization techniques