```python
import torch
import torch.nn as nn
import torch.nn.functional as F


def print_shape(name, x):
    print(f"{name:<12}: {tuple(x.shape)}")


class DoubleConv(nn.Module):
    """(Conv3x3 -> BN -> ReLU) x 2; padding=1 keeps the spatial size."""

    def __init__(self, in_c, out_c):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_c, out_c, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_c),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_c, out_c, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_c),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.conv(x)


class Down(nn.Module):
    """Downscaling: 2x2 max-pool (halves H and W), then DoubleConv."""

    def __init__(self, in_c, out_c):
        super().__init__()
        self.pool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_c, out_c),
        )

    def forward(self, x):
        return self.pool_conv(x)


class Up(nn.Module):
    """Upscaling (bilinear or transposed conv), pad to match the skip, then DoubleConv."""

    def __init__(self, in_c, out_c, bilinear=True):
        super().__init__()
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_c, out_c)
        else:
            self.up = nn.ConvTranspose2d(in_c, in_c // 2, 2, stride=2)
            self.conv = DoubleConv(in_c, out_c)

    def forward(self, x1, x2):
        x1 = self.up(x1)
        # Pad x1 so its H/W match the skip connection x2 (handles odd sizes).
        diffY = x2.size(2) - x1.size(2)
        diffX = x2.size(3) - x1.size(3)
        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        # Concatenate along the channel dimension and fuse with DoubleConv.
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class OutConv(nn.Module):
    """1x1 conv mapping the final feature maps to per-class logits."""

    def __init__(self, in_c, n_classes):
        super().__init__()
        self.conv = nn.Conv2d(in_c, n_classes, 1)

    def forward(self, x):
        return self.conv(x)


class UNet(nn.Module):
    def __init__(self, n_channels=3, n_classes=1, bilinear=True):
        super().__init__()
        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        factor = 2 if bilinear else 1
        self.down4 = Down(512, 1024 // factor)
        self.up1 = Up(1024, 512 // factor, bilinear)
        self.up2 = Up(512, 256 // factor, bilinear)
        self.up3 = Up(256, 128 // factor, bilinear)
        self.up4 = Up(128, 64, bilinear)
        self.outc = OutConv(64, n_classes)

    def forward(self, x):
        print_shape("input", x)
        x1 = self.inc(x);     print_shape("x1", x1)
        x2 = self.down1(x1);  print_shape("x2", x2)
        x3 = self.down2(x2);  print_shape("x3", x3)
        x4 = self.down3(x3);  print_shape("x4", x4)
        x5 = self.down4(x4);  print_shape("x5", x5)
        x = self.up1(x5, x4); print_shape("up1", x)
        x = self.up2(x, x3);  print_shape("up2", x)
        x = self.up3(x, x2);  print_shape("up3", x)
        x = self.up4(x, x1);  print_shape("up4", x)
        logits = self.outc(x); print_shape("logits", logits)
        return logits


if __name__ == "__main__":
    model = UNet()
    print(model)
    x = torch.randn(2, 3, 256, 256)
    _ = model(x)
```

The forward pass prints the following shape trace (the `print(model)` summary is omitted here):

```
input       : (2, 3, 256, 256)
x1          : (2, 64, 256, 256)
x2          : (2, 128, 128, 128)
x3          : (2, 256, 64, 64)
x4          : (2, 512, 32, 32)
x5          : (2, 512, 16, 16)
up1         : (2, 256, 32, 32)
up2         : (2, 128, 64, 64)
up3         : (2, 64, 128, 128)
up4         : (2, 64, 256, 256)
logits      : (2, 1, 256, 256)
```
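
The `F.pad` call in `Up.forward` aligns the upsampled feature map with its skip connection, so side lengths do not have to be multiples of 16. A quick check of my own (the 250×250 size is just an illustrative choice, assuming the `UNet` class above is in scope):

```python
import torch

# 250 -> 125 -> 62 -> 31 -> 15 through the four poolings, so up1 has to
# pad the upsampled 30x30 map to 31x31 before concatenating with x4.
model = UNet()
out = model(torch.randn(1, 3, 250, 250))
print(out.shape)  # expected: torch.Size([1, 1, 250, 250])
```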

- `nn.MaxPool2d(2)` downsamples by the factor you pass in: with no padding and the stride defaulting to `kernel_size`, the spatial size is halved here, e.g. a (4, 4) feature map becomes (2, 2).
- The original paper uses padding=0, so every 3×3 convolution shrinks the feature map by 2 pixels in height and width; here padding=1 keeps the size unchanged.
- `factor` is there to unify bilinear interpolation and transposed convolution: `nn.ConvTranspose2d(...)` halves the channel count, while bilinear interpolation does not. For upsampling, "interpolate, then convolve" has become the most common and most robust choice, with transposed convolution second. For U-Net or general semantic segmentation, bilinear interpolation + convolution is enough; consider transposed convolution for GANs or when you need learned upsampling. (All three points are illustrated in the snippet after this list.)
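
A minimal sketch of these three points; the (1, 64, 4, 4) tensor and the layer hyperparameters below are my own illustrative choices, not taken from the model above:

```python
import torch
import torch.nn as nn

x = torch.randn(1, 64, 4, 4)

# MaxPool2d(2): stride defaults to kernel_size, no padding -> size halves.
print(nn.MaxPool2d(2)(x).shape)                   # torch.Size([1, 64, 2, 2])

# 3x3 conv: padding=0 shrinks H/W by 2 (as in the paper), padding=1 keeps them.
print(nn.Conv2d(64, 64, 3, padding=0)(x).shape)   # torch.Size([1, 64, 2, 2])
print(nn.Conv2d(64, 64, 3, padding=1)(x).shape)   # torch.Size([1, 64, 4, 4])

# Bilinear interpolation keeps the channel count; ConvTranspose2d(in_c, in_c // 2)
# halves it, which is why `factor` is needed to make the two paths line up.
up_bilinear = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
up_transpose = nn.ConvTranspose2d(64, 32, 2, stride=2)
print(up_bilinear(x).shape)                       # torch.Size([1, 64, 8, 8])
print(up_transpose(x).shape)                      # torch.Size([1, 32, 8, 8])
```
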
`nn.ReLU(inplace=True)` overwrites the input tensor directly instead of allocating a new tensor for the result, which saves memory.
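
A quick way to see the in-place behaviour (my own check, not part of the original script): the returned tensor shares storage with the input, and the input itself now holds the clamped values.

```python
import torch
import torch.nn as nn

x = torch.randn(2, 3)
y = nn.ReLU(inplace=True)(x)

print(y.data_ptr() == x.data_ptr())  # True: output reuses the input's storage
print((x >= 0).all().item())         # True: negatives were overwritten with 0
```
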
`nn.Conv2d(in_c, out_c, 3, padding=1, bias=False)`: bias=False means the convolution gets no learnable bias term. The BatchNorm layer that follows subtracts the per-channel mean, which would cancel any constant bias, and its own learnable shift β takes over that role. In modern CNN architectures, a convolution immediately followed by BatchNorm is therefore usually created with bias=False.
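
A small sketch of why the bias is redundant (illustrative shapes, not tied to the model above): shifting the conv output by a constant per channel changes nothing after BatchNorm in training mode, since the batch statistics absorb it, while BatchNorm's β (`bn.bias`) remains as the learnable shift.

```python
import torch
import torch.nn as nn

x = torch.randn(8, 16, 32, 32)
conv = nn.Conv2d(16, 32, 3, padding=1, bias=True)
bn = nn.BatchNorm2d(32)  # bn.bias is the learnable per-channel shift beta
bn.train()

y1 = bn(conv(x))
conv.bias.data += 3.0    # pretend the conv learned a different constant bias
y2 = bn(conv(x))

# The mean subtraction inside BatchNorm removes the constant shift entirely.
print(torch.allclose(y1, y2, atol=1e-4))  # True
```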