Split View: CNN 아키텍처 완전 정복: LeNet부터 EfficientNet, Vision Transformer까지
CNN 아키텍처 완전 정복: LeNet부터 EfficientNet, Vision Transformer까지
CNN 아키텍처 완전 정복
컨볼루션 신경망(CNN, Convolutional Neural Network)은 컴퓨터 비전 혁명의 핵심입니다. 1998년 LeNet의 등장부터 2022년 ConvNeXt와 Vision Transformer까지, CNN 아키텍처는 놀라운 속도로 발전해 왔습니다. 이 가이드에서는 주요 CNN 아키텍처의 구조적 혁신을 이해하고, PyTorch로 직접 구현하는 방법을 완전히 마스터합니다.
1. CNN 기초
합성곱 연산 직관적 이해
합성곱(Convolution)은 이미지의 지역적 패턴을 추출하는 연산입니다. 작은 필터(커널)가 이미지 위를 슬라이딩하며 특징 맵(Feature Map)을 생성합니다.
입력 이미지 (5x5) 커널 (3x3) 출력 특징 맵 (3x3)
1 1 1 0 0 1 0 1 4 3 4
0 1 1 1 0 * 0 1 0 = 2 4 3
0 0 1 1 1 1 0 1 2 3 4
0 0 1 1 0
0 1 1 0 0
각 위치에서 커널과 이미지 패치의 원소별 곱의 합이 출력 특징 맵의 값이 됩니다.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
# Convolution visualization demo
def visualize_convolution():
    """Apply an edge-detection kernel to a toy 5x5 binary image and plot
    the input, the kernel and the output feature map side by side."""
    # Sample image, shaped (1, 1, 5, 5) = (batch, channel, H, W)
    image = torch.tensor([[
        [1., 1., 1., 0., 0.],
        [0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1.],
        [0., 0., 1., 1., 0.],
        [0., 1., 1., 0., 0.]
    ]]).unsqueeze(0)  # (1, 1, 5, 5)
    # Edge-detection (Laplacian-style) kernel, shaped (out_ch=1, in_ch=1, 3, 3)
    edge_kernel = torch.tensor([[
        [[-1., -1., -1.],
         [-1., 8., -1.],
         [-1., -1., -1.]]
    ]])  # (1, 1, 3, 3)
    # padding=1 keeps the output at the same 5x5 spatial size
    output = F.conv2d(image, edge_kernel, padding=1)
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].imshow(image[0, 0].numpy(), cmap='gray')
    axes[0].set_title('입력 이미지')
    axes[1].imshow(edge_kernel[0, 0].numpy(), cmap='RdYlBu')
    axes[1].set_title('엣지 감지 커널')
    axes[2].imshow(output[0, 0].detach().numpy(), cmap='gray')
    axes[2].set_title('출력 특징 맵')
    plt.tight_layout()
    plt.show()
visualize_convolution()
커널/필터, 스트라이드, 패딩
import torch
import torch.nn as nn
# Basic Conv2d parameters
conv = nn.Conv2d(
    in_channels=3,    # number of input channels (RGB = 3)
    out_channels=64,  # number of output channels (= number of filters)
    kernel_size=3,    # kernel size (3x3)
    stride=1,         # stride
    padding=1,        # padding ("same" padding for a 3x3 kernel at stride 1)
    bias=True
)
# Output size formula
# H_out = floor((H_in + 2*padding - kernel_size) / stride + 1)
# W_out = floor((W_in + 2*padding - kernel_size) / stride + 1)
def calc_output_size(input_size, kernel_size, stride, padding):
    """Spatial output size of a conv/pool layer.

    Implements floor((input_size + 2*padding - kernel_size) / stride) + 1.
    """
    padded_extent = input_size + 2 * padding
    return (padded_extent - kernel_size) // stride + 1
# Examples
print(calc_output_size(224, 3, 1, 1))  # 224 (same padding)
print(calc_output_size(224, 3, 2, 1))  # 112 (halved by stride 2)
print(calc_output_size(224, 7, 2, 3))  # 112 (AlexNet first conv layer)
# Parameter count
# Conv2d: (kernel_h * kernel_w * in_channels + 1) * out_channels  (+1 = bias)
params = (3 * 3 * 3 + 1) * 64
print(f"Conv(3->64, 3x3) 파라미터 수: {params:,}")  # 1,792
Pooling (Max, Average, Global)
import torch
import torch.nn as nn
import torch.nn.functional as F
x = torch.randn(1, 64, 28, 28)  # (batch, channels, H, W)
# Max pooling: keeps the strongest activation in each 2x2 window
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
out_max = max_pool(x)  # (1, 64, 14, 14)
# Average pooling
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
out_avg = avg_pool(x)  # (1, 64, 14, 14)
# Global Average Pooling (GAP) - collapse the spatial dims to 1x1
gap = nn.AdaptiveAvgPool2d(1)
out_gap = gap(x)  # (1, 64, 1, 1)
out_gap_flat = out_gap.flatten(1)  # (1, 64)
# Adaptive pooling - specify the output size directly
adaptive = nn.AdaptiveAvgPool2d((7, 7))
out_adaptive = adaptive(x)  # (1, 64, 7, 7) - works for any input size
print(f"입력: {x.shape}")
print(f"MaxPool: {out_max.shape}")
print(f"GAP: {out_gap_flat.shape}")
Receptive Field 계산
def calculate_receptive_field(layers):
    """Compute the receptive field of a stack of layers.

    layers: iterable of (kernel_size, stride, dilation) tuples, in order.
    Returns the receptive-field side length in input pixels.
    """
    field = 1            # receptive field so far
    stride_product = 1   # product of strides of all earlier layers
    for kernel, stride, dilation in layers:
        effective_kernel = dilation * (kernel - 1) + 1
        field += (effective_kernel - 1) * stride_product
        stride_product *= stride
    return field
# VGG-style stack (3x3 convs only)
vgg_layers = [
    (3, 1, 1),  # conv1
    (3, 1, 1),  # conv2
    (2, 2, 1),  # pool
    (3, 1, 1),  # conv3
    (3, 1, 1),  # conv4
    (2, 2, 1),  # pool
]
rf = calculate_receptive_field(vgg_layers)
print(f"VGG 6개 레이어 후 수용 영역: {rf}x{rf} 픽셀")
# Note: two stacked 3x3 convs cover the same receptive field as one 5x5 conv,
# but with 2*(9*C^2) parameters vs 25*C^2 -> the 3x3 pair is more efficient
2. CNN 발전사
LeNet-5 (1998, LeCun) - 최초의 실용적 CNN
LeNet-5는 Yann LeCun이 1998년 개발한 최초의 실용적 CNN으로, 손으로 쓴 숫자 인식(MNIST)에 사용되었습니다.
구조: Input(32x32) → C1(conv, 6@28x28) → S2(pool, 6@14x14) → C3(conv, 16@10x10) → S4(pool, 16@5x5) → C5(conv, 120@1x1) → F6(fc, 84) → Output(10)
import torch
import torch.nn as nn
class LeNet5(nn.Module):
    """LeNet-5 (LeCun, 1998) for 32x32 single-channel input.

    Uses Tanh activations and average pooling as in the original paper.
    """
    def __init__(self, num_classes=10):
        super(LeNet5, self).__init__()
        # Convolutional feature extractor
        self.features = nn.Sequential(
            # C1: 1@32x32 -> 6@28x28
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.Tanh(),
            # S2: 6@28x28 -> 6@14x14
            nn.AvgPool2d(kernel_size=2, stride=2),
            # C3: 6@14x14 -> 16@10x10
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.Tanh(),
            # S4: 16@10x10 -> 16@5x5
            nn.AvgPool2d(kernel_size=2, stride=2),
            # C5: 16@5x5 -> 120@1x1 (fully-connected layer expressed as a conv)
            nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0),
            nn.Tanh(),
        )
        self.classifier = nn.Sequential(
            # F6: 120 -> 84
            nn.Linear(120, 84),
            nn.Tanh(),
            # Output: 84 -> num_classes
            nn.Linear(84, num_classes)
        )
    def forward(self, x):
        x = self.features(x)   # (B, 120, 1, 1)
        x = x.flatten(1)       # (B, 120)
        x = self.classifier(x)
        return x
# Smoke test
model = LeNet5(num_classes=10)
x = torch.randn(4, 1, 32, 32)
out = model(x)
print(f"LeNet-5 출력: {out.shape}")  # (4, 10)
# Parameter count
total_params = sum(p.numel() for p in model.parameters())
print(f"LeNet-5 총 파라미터: {total_params:,}")  # ~60,000
AlexNet (2012, Krizhevsky) - 딥러닝 르네상스의 시작
AlexNet은 2012년 ImageNet 대회에서 top-5 오류율 15.3%를 달성하며 기존 최고 성능(26.2%)을 크게 앞서 딥러닝 시대를 열었습니다.
핵심 혁신:
- ReLU 활성화 함수 도입 (Tanh 대비 6배 빠른 학습)
- Dropout (0.5)으로 과적합 방지
- Data Augmentation (크롭, 반전)
- Local Response Normalization (LRN)
- GPU 2개 병렬 학습
import torch
import torch.nn as nn
class AlexNet(nn.Module):
    """AlexNet (Krizhevsky et al., 2012) for 224x224 RGB input.

    Five conv layers with ReLU, local response normalization and
    overlapping 3x3/2 max pooling, followed by three fully-connected
    layers regularized with dropout.
    """
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            # Layer 1: 3@224x224 -> 96@55x55
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 96@27x27
            # Layer 2: 96@27x27 -> 256@27x27
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 256@13x13
            # Layer 3: 256@13x13 -> 384@13x13
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # Layer 4: 384@13x13 -> 384@13x13
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # Layer 5: 384@13x13 -> 256@13x13
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 256@6x6
        )
        # Fixes the classifier input to 256*6*6 for any input resolution
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)
        )
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x
model = AlexNet(num_classes=1000)
x = torch.randn(4, 3, 224, 224)
out = model(x)
print(f"AlexNet 출력: {out.shape}")  # (4, 1000)
total_params = sum(p.numel() for p in model.parameters())
print(f"AlexNet 총 파라미터: {total_params:,}")  # ~61M
VGGNet (2014, Simonyan) - 깊이의 힘
VGGNet은 Oxford의 Visual Geometry Group이 개발했습니다. 핵심 인사이트는 모든 합성곱 레이어에 3x3 커널만 사용하여 깊이를 극적으로 늘린 것입니다.
왜 3x3인가?
- 3x3 두 개 = 5x5 하나의 수용 영역 (파라미터 수는 2×9C² vs 25C², 28% 절약)
- 3x3 세 개 = 7x7 하나의 수용 영역 (파라미터 수는 3×9C² vs 49C², 45% 절약)
- 더 많은 비선형 변환으로 표현력 향상
import torch
import torch.nn as nn
from typing import List, Union
class VGG(nn.Module):
    """Generic VGG wrapper: a pluggable feature extractor (built by
    make_layers) followed by the standard 4096-4096-num_classes head."""
    def __init__(self, features: nn.Module, num_classes: int = 1000, dropout: float = 0.5):
        super(VGG, self).__init__()
        self.features = features
        # Pins the classifier input to 512*7*7 regardless of input size
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, num_classes)
        )
        self._initialize_weights()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x
    def _initialize_weights(self):
        """Kaiming init for convs, constant init for BN, small normal for Linear."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
    """Build the VGG feature extractor from a configuration list.

    Each entry is either 'M' (2x2/2 max-pool) or an int giving the
    output channels of a 3x3 same-padded conv. When batch_norm is True
    every conv is followed by BatchNorm2d before its ReLU.
    """
    modules: List[nn.Module] = []
    channels = 3  # RGB input
    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        width = int(entry)
        modules.append(nn.Conv2d(channels, width, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(width))
        modules.append(nn.ReLU(inplace=True))
        channels = width
    return nn.Sequential(*modules)
# VGG configurations: ints are 3x3-conv output channels, 'M' is a 2x2 max-pool
cfgs = {
    'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
def vgg16(num_classes=1000):
    """VGG-16 with batch normalization."""
    return VGG(make_layers(cfgs['vgg16'], batch_norm=True), num_classes=num_classes)
def vgg19(num_classes=1000):
    """VGG-19 with batch normalization."""
    return VGG(make_layers(cfgs['vgg19'], batch_norm=True), num_classes=num_classes)
# Smoke test
model_vgg16 = vgg16()
x = torch.randn(2, 3, 224, 224)
out = model_vgg16(x)
print(f"VGG-16 출력: {out.shape}")
params_vgg16 = sum(p.numel() for p in model_vgg16.parameters())
print(f"VGG-16 파라미터: {params_vgg16:,}")  # ~138M
GoogLeNet/Inception (2014, Szegedy) - 병렬 다중 스케일 처리
Inception 모듈의 핵심은 동일한 레이어에서 서로 다른 크기의 커널(1x1, 3x3, 5x5)을 병렬로 처리하여 다양한 스케일의 특징을 동시에 추출하는 것입니다.
import torch
import torch.nn as nn
import torch.nn.functional as F
class InceptionModule(nn.Module):
    """Basic Inception module (GoogLeNet): four parallel branches whose
    outputs are concatenated along the channel dimension.

    Output channels = n1x1 + n3x3 + n5x5 + pool_proj.
    """
    def __init__(self, in_channels, n1x1, n3x3_reduce, n3x3,
                 n5x5_reduce, n5x5, pool_proj):
        super(InceptionModule, self).__init__()
        # 1x1 branch
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, n1x1, kernel_size=1),
            nn.BatchNorm2d(n1x1),
            nn.ReLU(inplace=True)
        )
        # 3x3 branch (1x1 bottleneck + 3x3)
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, n3x3_reduce, kernel_size=1),
            nn.BatchNorm2d(n3x3_reduce),
            nn.ReLU(inplace=True),
            nn.Conv2d(n3x3_reduce, n3x3, kernel_size=3, padding=1),
            nn.BatchNorm2d(n3x3),
            nn.ReLU(inplace=True)
        )
        # 5x5 branch (1x1 bottleneck + 5x5)
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, n5x5_reduce, kernel_size=1),
            nn.BatchNorm2d(n5x5_reduce),
            nn.ReLU(inplace=True),
            nn.Conv2d(n5x5_reduce, n5x5, kernel_size=5, padding=2),
            nn.BatchNorm2d(n5x5),
            nn.ReLU(inplace=True)
        )
        # Pool branch (3x3 max-pool + 1x1 projection)
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, pool_proj, kernel_size=1),
            nn.BatchNorm2d(pool_proj),
            nn.ReLU(inplace=True)
        )
    def forward(self, x):
        b1 = self.branch1(x)
        b2 = self.branch2(x)
        b3 = self.branch3(x)
        b4 = self.branch4(x)
        # Concatenate along the channel dimension
        return torch.cat([b1, b2, b3, b4], dim=1)
# Smoke test
module = InceptionModule(192, 64, 96, 128, 16, 32, 32)
x = torch.randn(2, 192, 28, 28)
out = module(x)
print(f"Inception 출력: {out.shape}")  # (2, 256, 28, 28) = 64+128+32+32
ResNet (2015, He) - 잔차 연결로 기울기 소실 해결
ResNet은 He Kaiming이 2015년 발표한 혁신적 아키텍처입니다. 잔차 연결(Skip Connection)을 통해 기울기가 깊은 레이어까지 전달되어 152층의 매우 깊은 네트워크를 학습할 수 있게 되었습니다.
핵심 아이디어: H(x) = F(x) + x
레이어가 H(x)를 직접 학습하는 대신 잔차 F(x) = H(x) - x를 학습합니다. 최적 함수가 항등 함수에 가까울 때, 잔차를 0으로 만드는 것이 더 쉽습니다.
import torch
import torch.nn as nn
from typing import Optional, Type, List
class BasicBlock(nn.Module):
    """Basic residual block for ResNet-18/34: two 3x3 convs + skip connection."""
    expansion = 1  # output channels = out_channels * expansion
    def __init__(self, in_channels: int, out_channels: int,
                 stride: int = 1, downsample: Optional[nn.Module] = None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample  # projection to match shortcut shape
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        identity = x  # keep the input for the residual connection
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        # Project the identity when shapes differ (stride or channel change)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity  # key idea: residual connection
        out = self.relu(out)
        return out
class Bottleneck(nn.Module):
    """Bottleneck residual block for ResNet-50/101/152:
    1x1 reduce -> 3x3 -> 1x1 expand (x4) with a skip connection."""
    expansion = 4  # output channels = out_channels * expansion
    def __init__(self, in_channels: int, out_channels: int,
                 stride: int = 1, downsample: Optional[nn.Module] = None):
        super(Bottleneck, self).__init__()
        # 1x1 conv (channel reduction)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # 3x3 conv (spatial processing; carries the stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1 conv (channel expansion: out_channels * 4)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample  # projection for the shortcut if needed
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))  # no ReLU before the addition
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out
class ResNet(nn.Module):
    """Full ResNet: 7x7 stem, four residual stages, GAP + linear head.

    block:  BasicBlock (ResNet-18/34) or Bottleneck (ResNet-50/101/152)
    layers: number of residual blocks in each of the four stages
    """
    def __init__(self, block: Type[nn.Module], layers: List[int],
                 num_classes: int = 1000):
        super(ResNet, self).__init__()
        self.in_channels = 64  # running channel count, updated by _make_layer
        # Stem: 7x7/2 conv + 3x3/2 max-pool -> 4x total downsampling
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four stages; stages 2-4 halve the spatial size
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # Classifier head
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        self._initialize_weights()
    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of `blocks` residual blocks; only the first
        block may downsample / change the channel count."""
        downsample = None
        # Projection shortcut when the identity path cannot match shapes
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion)
            )
        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))
        return nn.Sequential(*layers)
    def _initialize_weights(self):
        """Kaiming init for convs; BN scale to 1, shift to 0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Stem
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        # Four residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # Classify
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.fc(x)
        return x
def resnet18(num_classes=1000):
    """ResNet-18: BasicBlock x [2, 2, 2, 2]."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)
def resnet34(num_classes=1000):
    """ResNet-34: BasicBlock x [3, 4, 6, 3]."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)
def resnet50(num_classes=1000):
    """ResNet-50: Bottleneck x [3, 4, 6, 3]."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)
def resnet101(num_classes=1000):
    """ResNet-101: Bottleneck x [3, 4, 23, 3]."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
def resnet152(num_classes=1000):
    """ResNet-152: Bottleneck x [3, 8, 36, 3]."""
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)
# Smoke test
for name, model_fn in [('ResNet-18', resnet18), ('ResNet-50', resnet50)]:
    model = model_fn()
    x = torch.randn(2, 3, 224, 224)
    out = model(x)
    params = sum(p.numel() for p in model.parameters())
    print(f"{name}: 출력={out.shape}, 파라미터={params:,}")
DenseNet (2017, Huang) - 조밀한 연결
DenseNet은 각 레이어를 이전의 모든 레이어와 연결합니다. L개의 레이어가 있을 때 ResNet은 L개의 연결이지만 DenseNet은 L(L+1)/2개의 연결이 생깁니다.
import torch
import torch.nn as nn
import torch.nn.functional as F
class DenseLayer(nn.Module):
    """Single DenseNet layer: BN-ReLU-1x1 bottleneck, then BN-ReLU-3x3.

    Accepts either a tensor or a list of all previous feature maps;
    they are concatenated on the channel axis before the bottleneck.
    Produces `growth_rate` new channels.
    """
    def __init__(self, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super(DenseLayer, self).__init__()
        # Bottleneck: 1x1 conv caps the width at bn_size * growth_rate
        self.norm1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, bn_size * growth_rate,
                               kernel_size=1, bias=False)
        # 3x3 conv producing the new features
        self.norm2 = nn.BatchNorm2d(bn_size * growth_rate)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(bn_size * growth_rate, growth_rate,
                               kernel_size=3, padding=1, bias=False)
        self.drop_rate = drop_rate
    def forward(self, x):
        # Accept a raw tensor (standalone use) or the list kept by DenseBlock
        if isinstance(x, torch.Tensor):
            prev_features = [x]
        else:
            prev_features = x
        # Concatenate all previous feature maps
        concat_input = torch.cat(prev_features, dim=1)
        out = self.conv1(self.relu1(self.norm1(concat_input)))
        out = self.conv2(self.relu2(self.norm2(out)))
        if self.drop_rate > 0:
            out = F.dropout(out, p=self.drop_rate, training=self.training)
        return out
class DenseBlock(nn.Module):
    """Dense block: num_layers DenseLayers with dense connectivity.

    Layer i receives in_channels + i*growth_rate channels; the block's
    output concatenates the input with every layer's new features.
    """
    def __init__(self, num_layers, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            layer = DenseLayer(
                in_channels + i * growth_rate,
                growth_rate, bn_size, drop_rate
            )
            self.layers.append(layer)
    def forward(self, x):
        features = [x]  # running list of all feature maps so far
        for layer in self.layers:
            new_feat = layer(features)  # DenseLayer concatenates internally
            features.append(new_feat)
        return torch.cat(features, dim=1)
class TransitionLayer(nn.Module):
    """Transition between dense blocks: BN-ReLU-1x1 conv (channel
    compression) followed by 2x2 average pooling (downsampling)."""
    def __init__(self, in_channels, out_channels):
        super(TransitionLayer, self).__init__()
        self.norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
    def forward(self, x):
        return self.pool(self.conv(self.relu(self.norm(x))))
class DenseNet121(nn.Module):
    """DenseNet-121: dense blocks of 6/12/24/16 layers with 0.5
    channel compression in the transition layers."""
    def __init__(self, num_classes=1000, growth_rate=32, num_init_features=64):
        super(DenseNet121, self).__init__()
        # DenseNet-121 layout: 6, 12, 24, 16 layers per block
        block_config = [6, 12, 24, 16]
        compression = 0.5  # transitions keep half the channels
        # Stem
        self.features = nn.Sequential(
            nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        # Alternate dense blocks and transition layers
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(num_layers, num_features, growth_rate)
            self.features.add_module(f'denseblock{i+1}', block)
            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:  # no transition after the last block
                out_features = int(num_features * compression)
                transition = TransitionLayer(num_features, out_features)
                self.features.add_module(f'transition{i+1}', transition)
                num_features = out_features
        # Final BN + ReLU
        self.features.add_module('norm_final', nn.BatchNorm2d(num_features))
        self.features.add_module('relu_final', nn.ReLU(inplace=True))
        # Classifier head
        self.classifier = nn.Linear(num_features, num_classes)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    def forward(self, x):
        features = self.features(x)
        out = self.avgpool(features)
        out = out.flatten(1)
        out = self.classifier(out)
        return out
# Smoke test
model = DenseNet121(num_classes=1000)
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"DenseNet-121 출력: {out.shape}, 파라미터: {params:,}")
MobileNet (2017) - 경량화 혁명
MobileNet은 Depthwise Separable Convolution을 도입하여 모바일/엣지 디바이스에서 실행할 수 있는 경량 CNN을 구현했습니다.
import torch
import torch.nn as nn
class DepthwiseSeparableConv(nn.Module):
    """Depthwise separable convolution (MobileNetV1 building block).

    Factorizes a standard conv into a per-channel 3x3 depthwise conv
    plus a 1x1 pointwise conv that mixes channels.
    """
    def __init__(self, in_channels, out_channels, stride=1):
        super(DepthwiseSeparableConv, self).__init__()
        # Depthwise: groups=in_channels -> each channel filtered independently
        self.depthwise = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=3,
                      stride=stride, padding=1, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU6(inplace=True)
        )
        # Pointwise: 1x1 conv recombines the channels
        self.pointwise = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU6(inplace=True)
        )
    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted residual block: 1x1 expand -> 3x3 depthwise
    -> 1x1 linear projection, with a residual when shape is preserved."""
    def __init__(self, in_channels, out_channels, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        hidden_dim = int(in_channels * expand_ratio)
        # Residual connection only when spatial size and channels match
        self.use_res_connect = (stride == 1 and in_channels == out_channels)
        layers = []
        if expand_ratio != 1:
            # Expand: 1x1 conv widens the channels
            layers += [
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True)
            ]
        layers += [
            # Depthwise 3x3
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride=stride,
                      padding=1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Project: linear 1x1 (no activation) back to out_channels
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        ]
        self.conv = nn.Sequential(*layers)
    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
class MobileNetV2(nn.Module):
    """MobileNetV2: stem conv, stacked inverted-residual blocks, 1x1 head.

    width_mult scales the channel counts of every layer.
    """
    def __init__(self, num_classes=1000, width_mult=1.0):
        super(MobileNetV2, self).__init__()
        # t=expand_ratio, c=out_channels, n=num_layers, s=stride (first block)
        inverted_residual_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]
        input_channel = int(32 * width_mult)
        last_channel = int(1280 * max(1.0, width_mult))
        # Stem: standard 3x3/2 conv
        features = [
            nn.Sequential(
                nn.Conv2d(3, input_channel, 3, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(input_channel),
                nn.ReLU6(inplace=True)
            )
        ]
        # Inverted residual stages; only the first block of each stage strides
        for t, c, n, s in inverted_residual_settings:
            output_channel = int(c * width_mult)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(
                    InvertedResidual(input_channel, output_channel, stride, expand_ratio=t)
                )
                input_channel = output_channel
        # Final 1x1 conv up to last_channel
        features.append(nn.Sequential(
            nn.Conv2d(input_channel, last_channel, 1, bias=False),
            nn.BatchNorm2d(last_channel),
            nn.ReLU6(inplace=True)
        ))
        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(last_channel, num_classes)
        )
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x
# Parameter comparison: standard conv vs depthwise separable conv (512->512, 3x3)
standard_conv_params = 3 * 3 * 512 * 512  # standard convolution
dw_sep_params = (3 * 3 * 512) + (512 * 512)  # depthwise + pointwise
print(f"표준 합성곱: {standard_conv_params:,}")
print(f"Depthwise Separable: {dw_sep_params:,}")
print(f"절감 비율: {(1 - dw_sep_params/standard_conv_params):.1%}")
EfficientNet (2019, Tan) - 복합 스케일링
EfficientNet은 폭(Width), 깊이(Depth), 해상도(Resolution)를 복합적으로 스케일링하는 방법을 제안했습니다.
import torch
import torch.nn as nn
import math
class MBConvBlock(nn.Module):
    """EfficientNet MBConv block (MobileNetV2-style inverted residual
    with squeeze-and-excitation).

    Structure: optional 1x1 expand -> depthwise conv -> SE gating ->
    1x1 linear projection, with a residual connection when stride == 1
    and in_channels == out_channels.

    Fix over the original: the squeeze step (nn.AdaptiveAvgPool2d) was
    appended to the main layer list and the slice ``layers[:len(layers)]``
    (commented as "excluding SE") excluded nothing, forcing forward() to
    skip it with a fragile break-on-isinstance loop. The SE path is now
    kept fully separate, so the main path is a plain Sequential.
    """
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride, expand_ratio, se_ratio=0.25):
        super(MBConvBlock, self).__init__()
        self.stride = stride
        # Residual only when the block preserves shape
        self.use_res = (stride == 1 and in_channels == out_channels)
        hidden_dim = in_channels * expand_ratio
        layers = []
        if expand_ratio != 1:
            # 1x1 expansion
            layers += [
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim, momentum=0.01, eps=1e-3),
                nn.SiLU()
            ]
        # Depthwise conv (carries the stride; 'same'-style padding)
        layers += [
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride=stride,
                      padding=kernel_size//2, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim, momentum=0.01, eps=1e-3),
            nn.SiLU()
        ]
        self.conv = nn.Sequential(*layers)  # expand + depthwise path
        # Squeeze-and-Excitation (reduction sized relative to in_channels)
        se_channels = max(1, int(in_channels * se_ratio))
        self.se_reduce = nn.Conv2d(hidden_dim, se_channels, 1)
        self.se_expand = nn.Conv2d(se_channels, hidden_dim, 1)
        self.se_act = nn.SiLU()
        # 1x1 projection (linear bottleneck: no activation after BN)
        self.project = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels, momentum=0.01, eps=1e-3)
        )
        self._hidden_dim = hidden_dim
    def forward(self, x):
        identity = x
        # Expand + depthwise
        out = self.conv(x)
        # SE: global average -> reduce -> expand -> sigmoid gate
        se = out.mean([2, 3], keepdim=True)
        se = self.se_act(self.se_reduce(se))
        se = torch.sigmoid(self.se_expand(se))
        out = out * se
        # Projection
        out = self.project(out)
        if self.use_res:
            out = out + identity
        return out
# EfficientNet compound-scaling coefficients
# Each entry is (width_coeff, depth_coeff, resolution, dropout_rate)
efficientnet_params = {
    'b0': (1.0, 1.0, 224, 0.2),
    'b1': (1.0, 1.1, 240, 0.2),
    'b2': (1.1, 1.2, 260, 0.3),
    'b3': (1.2, 1.4, 300, 0.3),
    'b4': (1.4, 1.8, 380, 0.4),
    'b5': (1.6, 2.2, 456, 0.4),
    'b6': (1.8, 2.6, 528, 0.5),
    'b7': (2.0, 3.1, 600, 0.5),
}
print("EfficientNet 스케일링 파라미터:")
for version, (w, d, r, drop) in efficientnet_params.items():
    print(f" B{version[1]}: 폭={w:.1f}, 깊이={d:.1f}, 해상도={r}, 드롭아웃={drop}")
ConvNeXt (2022, Liu) - Modern ConvNet
ConvNeXt는 ViT의 디자인 원칙을 CNN에 적용하여 Transformer와 대등한 성능을 달성한 "modernized" ConvNet입니다.
import torch
import torch.nn as nn
class ConvNeXtBlock(nn.Module):
    """ConvNeXt block: 7x7 depthwise conv -> LayerNorm (channels-last)
    -> 1x1 expand (Linear, x4) -> GELU -> 1x1 project -> layer scale ->
    residual connection."""
    def __init__(self, dim, layer_scale_init_value=1e-6):
        super(ConvNeXtBlock, self).__init__()
        # Depthwise conv with a large 7x7 kernel ('same' padding)
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        # LayerNorm applied in channels-last layout
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        # Inverted bottleneck (4x channel expansion) as Linear layers
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        # Layer scale: learnable per-channel scaling of the residual branch
        self.gamma = nn.Parameter(
            layer_scale_init_value * torch.ones(dim),
            requires_grad=True
        ) if layer_scale_init_value > 0 else None
    def forward(self, x):
        identity = x
        x = self.dwconv(x)
        # (N, C, H, W) -> (N, H, W, C) for LayerNorm / Linear layers
        x = x.permute(0, 2, 3, 1)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        # (N, H, W, C) -> (N, C, H, W)
        x = x.permute(0, 3, 1, 2)
        return identity + x
class ConvNeXt(nn.Module):
    """ConvNeXt classifier (Liu et al., 2022).

    Patchify stem (4x4 stride-4 conv), four stages of ConvNeXtBlocks
    separated by norm + 2x2 stride-2 downsampling convs, then global
    average pooling, LayerNorm and a linear head.

    Fix over the original: removed an unused local ``stem`` Sequential
    (built with a confusing ``if False else`` expression) and the unused
    empty ``self.downsample_layers`` ModuleList — both were dead code.
    """
    def __init__(self, in_channels=3, num_classes=1000,
                 depths=[3, 3, 9, 3], dims=[96, 192, 384, 768]):
        super(ConvNeXt, self).__init__()
        # Stem: "patchify" the image into dims[0]-channel 4x4 patches
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, dims[0], kernel_size=4, stride=4),
        )
        # Four stages; stages 1..3 are preceded by norm + 2x2 downsampling
        self.stages = nn.ModuleList()
        self.downsamples = nn.ModuleList()
        for i in range(4):
            if i > 0:
                self.downsamples.append(nn.Sequential(
                    # GroupNorm(1, C) == per-sample channel-wise LayerNorm
                    nn.GroupNorm(1, dims[i-1]),
                    nn.Conv2d(dims[i-1], dims[i], kernel_size=2, stride=2)
                ))
            else:
                self.downsamples.append(nn.Identity())
            self.stages.append(nn.Sequential(
                *[ConvNeXtBlock(dims[i]) for _ in range(depths[i])]
            ))
        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)
        self.head = nn.Linear(dims[-1], num_classes)
    def forward(self, x):
        x = self.stem(x)
        for i, (ds, stage) in enumerate(zip(self.downsamples, self.stages)):
            if i > 0:
                x = ds(x)
            x = stage(x)
        x = x.mean([-2, -1])  # global average pooling over H, W
        x = self.norm(x)
        x = self.head(x)
        return x
# ConvNeXt-T configuration: depths=[3,3,9,3], dims=[96,192,384,768]
model = ConvNeXt(num_classes=1000, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768])
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"ConvNeXt-T 출력: {out.shape}, 파라미터: {params:,}")
3. Vision Transformer (ViT)
ViT는 이미지를 패치로 분할하고 Transformer를 적용하는 획기적인 접근법입니다.
import torch
import torch.nn as nn
import math
class PatchEmbedding(nn.Module):
    """Convert an image into a sequence of patch embeddings.

    A single conv with kernel_size == stride == patch_size extracts and
    linearly projects non-overlapping patches in one step.
    """
    def __init__(self, image_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super(PatchEmbedding, self).__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        # Patch extraction + linear projection as one convolution
        self.projection = nn.Conv2d(
            in_channels, embed_dim,
            kernel_size=patch_size, stride=patch_size
        )
    def forward(self, x):
        # x: (B, C, H, W)
        x = self.projection(x)  # (B, embed_dim, H/patch, W/patch)
        x = x.flatten(2)        # (B, embed_dim, num_patches)
        x = x.transpose(1, 2)   # (B, num_patches, embed_dim)
        return x
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection.

    Dropout is applied to the attention weights only.
    """
    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5  # 1/sqrt(d_k)
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)  # fused Q, K, V
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B, N, C = x.shape  # (batch, tokens, embed_dim)
        # Build Q, K, V: reshape to (3, B, heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # each (B, heads, N, head_dim)
        # Scaled dot-product attention weights
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.dropout(attn)
        # Weighted sum; merge heads back into embed_dim
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x
class TransformerBlock(nn.Module):
    """Pre-norm Transformer encoder block: MSA + MLP, each wrapped in a
    residual connection."""
    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.0):
        super(TransformerBlock, self).__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        mlp_hidden = int(embed_dim * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, embed_dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        x = x + self.attn(self.norm1(x))  # residual connection
        x = x + self.mlp(self.norm2(x))   # residual connection
        return x
class VisionTransformer(nn.Module):
    """Vision Transformer (ViT).

    Patch embedding + learnable CLS token and position embeddings, a
    stack of pre-norm Transformer blocks, classification from the CLS
    token's final representation.
    """
    def __init__(self, image_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12,
                 mlp_ratio=4.0, dropout=0.0):
        super(VisionTransformer, self).__init__()
        # Patch embedding
        self.patch_embed = PatchEmbedding(image_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches
        # CLS token + learned position embeddings (one per patch + CLS)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(
            torch.zeros(1, num_patches + 1, embed_dim)
        )
        self.pos_dropout = nn.Dropout(dropout)
        # Transformer encoder blocks
        self.blocks = nn.Sequential(*[
            TransformerBlock(embed_dim, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)
        self._init_weights()
    def _init_weights(self):
        """Truncated-normal init (std 0.02) for embeddings and Linear weights."""
        nn.init.trunc_normal_(self.pos_embedding, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
    def forward(self, x):
        B = x.shape[0]
        # Patch embedding
        x = self.patch_embed(x)  # (B, num_patches, embed_dim)
        # Prepend the CLS token
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)  # (B, num_patches+1, embed_dim)
        # Add position embeddings
        x = x + self.pos_embedding
        x = self.pos_dropout(x)
        # Transformer encoder
        x = self.blocks(x)
        x = self.norm(x)
        # Classify from the CLS token
        cls_output = x[:, 0]
        logits = self.head(cls_output)
        return logits
# ViT variants
def vit_small(num_classes=1000):
    """ViT-Small/16: embed_dim 384, 12 layers, 6 heads."""
    return VisionTransformer(
        image_size=224, patch_size=16, embed_dim=384, depth=12,
        num_heads=6, num_classes=num_classes
    )
def vit_base(num_classes=1000):
    """ViT-Base/16: embed_dim 768, 12 layers, 12 heads."""
    return VisionTransformer(
        image_size=224, patch_size=16, embed_dim=768, depth=12,
        num_heads=12, num_classes=num_classes
    )
def vit_large(num_classes=1000):
    """ViT-Large/16: embed_dim 1024, 24 layers, 16 heads."""
    return VisionTransformer(
        image_size=224, patch_size=16, embed_dim=1024, depth=24,
        num_heads=16, num_classes=num_classes
    )
# Smoke test
model = vit_base()
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"ViT-Base 출력: {out.shape}, 파라미터: {params:,}")
4. Object Detection 아키텍처
YOLO 계열 간단 구현
import torch
import torch.nn as nn
class YOLOHead(nn.Module):
    """Simplified YOLO detection head.

    For every grid cell and anchor it predicts a box (x, y, w, h),
    an objectness score, and per-class logits.
    """

    def __init__(self, in_channels, num_anchors, num_classes):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        # 5 = (x, y, w, h, objectness) per anchor.
        pred_channels = num_anchors * (5 + num_classes)
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, in_channels * 2, kernel_size=3, padding=1),
            nn.BatchNorm2d(in_channels * 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(in_channels * 2, pred_channels, kernel_size=1),
        )

    def forward(self, x):
        raw = self.head(x)
        batch, _, height, width = raw.shape
        # Reorder to (B, anchors, H, W, 5 + classes) for per-cell decoding.
        raw = raw.reshape(batch, self.num_anchors,
                          5 + self.num_classes, height, width)
        return raw.permute(0, 1, 3, 4, 2).contiguous()
# 간단한 YOLOv1 스타일 모델
class SimpleYOLO(nn.Module):
    """YOLOv1-style detector: backbone features -> 7x7 grid of predictions.

    Each of the 7x7 cells predicts `num_boxes` boxes (5 values each:
    x, y, w, h, confidence) plus one shared set of class scores.
    The backbone is expected to output 512 channels.
    """

    def __init__(self, backbone, num_classes=80, num_boxes=2):
        super().__init__()
        self.backbone = backbone
        self.num_classes = num_classes
        self.num_boxes = num_boxes
        cell_dim = num_boxes * 5 + num_classes
        # Pool to a fixed 7x7 grid so any input resolution works.
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d((7, 7)),
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 7 * 7 * cell_dim),
        )

    def forward(self, x):
        feats = self.backbone(x)
        preds = self.head(feats)
        return preds.reshape(-1, 7, 7, self.num_boxes * 5 + self.num_classes)
5. Image Segmentation: U-Net
import torch
import torch.nn as nn
import torch.nn.functional as F
class DoubleConv(nn.Module):
    """Two 3x3 Conv-BN-ReLU stages — the basic U-Net building block."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = []
        ch_in = in_channels
        for _ in range(2):
            stages += [
                nn.Conv2d(ch_in, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ]
            ch_in = out_channels  # second conv keeps the width
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        return self.double_conv(x)
class UNet(nn.Module):
    """U-Net for semantic segmentation (e.g. medical images).

    Symmetric encoder/decoder with skip connections: each encoder stage
    widens channels and halves the spatial size; the decoder mirrors it
    with transposed convolutions and concatenated skips.

    Args:
        in_channels: number of input image channels.
        num_classes: number of output segmentation classes.
        features: channel widths of the encoder stages (any sequence).
    """

    # NOTE: the default is a tuple, not a list — a mutable default
    # argument would be shared across all calls (classic Python pitfall).
    def __init__(self, in_channels=1, num_classes=2, features=(64, 128, 256, 512)):
        super().__init__()
        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.pool = nn.MaxPool2d(2, 2)
        # Encoder: DoubleConv blocks with increasing width.
        for feature in features:
            self.encoders.append(DoubleConv(in_channels, feature))
            in_channels = feature
        # Bottleneck between encoder and decoder.
        self.bottleneck = DoubleConv(features[-1], features[-1] * 2)
        # Decoder: (upsample, DoubleConv) pairs, widest stage first.
        for feature in reversed(features):
            self.decoders.append(
                nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2)
            )
            self.decoders.append(DoubleConv(feature * 2, feature))
        # 1x1 conv maps the final feature map to per-class logits.
        self.final_conv = nn.Conv2d(features[0], num_classes, kernel_size=1)

    def forward(self, x):
        skip_connections = []
        # Encoder: remember each stage's output for the skip connections.
        for encoder in self.encoders:
            x = encoder(x)
            skip_connections.append(x)
            x = self.pool(x)
        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]  # deepest skip first
        # Decoder: upsample, concat the matching skip, then DoubleConv.
        for i in range(0, len(self.decoders), 2):
            x = self.decoders[i](x)  # transposed-conv upsampling
            skip = skip_connections[i // 2]
            # Odd input sizes leave the upsampled map slightly smaller;
            # resize (nearest-neighbor by default) so the concat lines up.
            if x.shape != skip.shape:
                x = F.interpolate(x, size=skip.shape[2:])
            x = torch.cat([skip, x], dim=1)  # skip connection
            x = self.decoders[i + 1](x)
        return self.final_conv(x)
# Smoke test at the original paper's 572x572 resolution.
model = UNet(in_channels=1, num_classes=2)
x = torch.randn(4, 1, 572, 572)
out = model(x)
print(f"U-Net 출력: {out.shape}")  # expected: (4, 2, 572, 572)
params = sum(t.numel() for t in model.parameters())
print(f"U-Net 파라미터: {params:,}")
6. 전이학습 실전 가이드
torchvision.models 활용
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import torch.optim as optim
from tqdm import tqdm
# Pull ImageNet-pretrained weights from torchvision's model zoo
# (downloads on first use, then served from the local cache).
model_resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
model_efficientnet = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)
model_vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
print(f"ResNet-50 파라미터: {sum(p.numel() for p in model_resnet.parameters()):,}")
def feature_extraction(num_classes, freeze=True):
    """Transfer learning via feature extraction.

    Loads an ImageNet-pretrained ResNet-50, optionally freezes the
    backbone, and swaps in a fresh two-layer classification head so
    that only the head is trained.
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    if freeze:
        for p in model.parameters():
            p.requires_grad = False
    # Replace the ImageNet head with one sized for our task.
    head_in = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(head_in, 256),
        nn.ReLU(),
        nn.Linear(256, num_classes),
    )
    # The fresh head must stay trainable even when the backbone is frozen.
    for p in model.fc.parameters():
        p.requires_grad = True
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"학습 가능 파라미터: {trainable:,} / {total:,} ({trainable/total:.1%})")
    return model
def fine_tuning(num_classes, unfreeze_layers=2):
    """Transfer learning via fine-tuning.

    Freezes the whole pretrained ResNet-50, re-enables gradients for
    the last `unfreeze_layers` entries of [layer4, avgpool, fc], and
    installs a fresh classification head (trainable by default).
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    for p in model.parameters():
        p.requires_grad = False
    # Unfreeze only the tail of the network.
    tail = [model.layer4, model.avgpool, model.fc]
    for section in tail[-unfreeze_layers:]:
        for p in section.parameters():
            p.requires_grad = True
    # New task-specific head.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model
# 학습 루프
def train_model(model, train_loader, val_loader, epochs=10,
                learning_rate=1e-3, device='cuda'):
    """Train and validate `model`, keeping the best checkpoint.

    Uses AdamW with a 10x smaller learning rate on backbone parameters
    than on the 'fc' head (discriminative learning rates), cosine
    annealing over `epochs`, and gradient clipping at norm 1.0.

    Returns:
        (model, history) where history holds per-epoch losses/accuracies.

    Side effects:
        Saves the best-by-validation-accuracy weights to 'best_model.pt'.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Parameter groups: any parameter whose name contains 'fc' is treated
    # as the head; everything else as the backbone. Assumes torchvision
    # ResNet-style naming — verify before reusing with other architectures.
    backbone_params = [p for n, p in model.named_parameters()
                       if 'fc' not in n and p.requires_grad]
    head_params = [p for n, p in model.named_parameters()
                   if 'fc' in n and p.requires_grad]
    optimizer = optim.AdamW([
        {'params': backbone_params, 'lr': learning_rate * 0.1},
        {'params': head_params, 'lr': learning_rate}
    ], weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    best_val_acc = 0.0
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
    for epoch in range(epochs):
        # ---- training phase ----
        model.train()
        train_loss, train_correct, train_total = 0.0, 0, 0
        for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            # Gradient clipping guards against exploding gradients.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # Accumulate batch-size-weighted sums so the epoch average is
            # correct even when the final batch is smaller.
            train_loss += loss.item() * images.size(0)
            train_correct += (outputs.argmax(1) == labels).sum().item()
            train_total += images.size(0)
        # ---- validation phase ----
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                val_correct += (outputs.argmax(1) == labels).sum().item()
                val_total += images.size(0)
        scheduler.step()  # one cosine LR step per epoch
        train_acc = train_correct / train_total
        val_acc = val_correct / val_total
        epoch_train_loss = train_loss / train_total
        epoch_val_loss = val_loss / val_total
        history['train_loss'].append(epoch_train_loss)
        history['val_loss'].append(epoch_val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        print(f"에폭 {epoch+1}: Train={train_acc:.4f}, Val={val_acc:.4f}")
        # Checkpoint only when validation accuracy improves.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
    print(f"최고 검증 정확도: {best_val_acc:.4f}")
    return model, history
# 데이터 증강
def get_transforms(image_size=224):
    """Build (train, val) torchvision transform pipelines.

    Training applies random crop/flip/rotation/color jitter for
    augmentation; validation uses a deterministic resize + center crop.
    Both normalize with the standard ImageNet channel statistics.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2,
                               saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])
    val_transforms = transforms.Compose([
        transforms.Resize(int(image_size * 1.14)),  # ~256 for a 224 crop
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])
    return train_transforms, val_transforms
아키텍처 성능 비교
| 모델 | 년도 | Top-1 정확도 | 파라미터 | FLOPs |
|---|---|---|---|---|
| LeNet-5 | 1998 | ~99% (MNIST) | 60K | - |
| AlexNet | 2012 | 56.5% | 61M | 724M |
| VGG-16 | 2014 | 71.6% | 138M | 15.5G |
| GoogLeNet | 2014 | 68.7% | 6.8M | 1.5G |
| ResNet-50 | 2015 | 75.3% | 25M | 4.1G |
| DenseNet-121 | 2017 | 74.4% | 8M | 2.9G |
| MobileNetV2 | 2018 | 71.8% | 3.4M | 300M |
| EfficientNet-B0 | 2019 | 77.1% | 5.3M | 390M |
| ConvNeXt-T | 2022 | 82.1% | 28M | 4.5G |
| ViT-B/16 | 2020 | 81.8% | 86M | 17.6G |
마무리
CNN 아키텍처는 지속적으로 진화하고 있습니다.
- LeNet (1998): 최초의 실용적 CNN, 구조의 근간 확립
- AlexNet (2012): 딥러닝 르네상스, ReLU와 Dropout 도입
- VGGNet (2014): 3x3 컨볼루션의 힘, 깊이의 중요성 입증
- ResNet (2015): 잔차 연결로 기울기 소실 해결, 수백 층 학습 가능
- DenseNet (2017): 조밀한 연결로 특징 재사용 극대화
- MobileNet (2017): 경량화로 모바일 배포 실현
- EfficientNet (2019): 복합 스케일링으로 최고 효율 달성
- ConvNeXt (2022): Transformer 인사이트를 CNN에 적용
- ViT (2020): 이미지도 시퀀스로 처리하는 새로운 패러다임
실전에서는 torchvision의 사전학습 모델에서 시작하여 전이학습으로 빠르게 목표 태스크에 적용하는 것을 권장합니다.
참고 자료
- PyTorch Vision Models
- ResNet 논문: He et al., "Deep Residual Learning for Image Recognition" (arXiv:1512.03385)
- EfficientNet 논문: Tan & Le, "EfficientNet: Rethinking Model Scaling" (arXiv:1905.11946)
- ViT 논문: Dosovitskiy et al., "An Image is Worth 16x16 Words" (arXiv:2010.11929)
- ConvNeXt 논문: Liu et al., "A ConvNet for the 2020s" (arXiv:2201.03545)
CNN Architecture Complete Guide: From LeNet to EfficientNet and Vision Transformers
CNN Architecture Complete Guide
Convolutional Neural Networks (CNNs) are the backbone of the computer vision revolution. From LeNet in 1998 to ConvNeXt and Vision Transformers in 2022, CNN architectures have evolved at a remarkable pace. This guide walks through the structural innovations behind each major CNN architecture and teaches you how to implement them in PyTorch.
1. CNN Fundamentals
Understanding Convolution Intuitively
Convolution is an operation that extracts local patterns from an image. A small filter (kernel) slides over the image to produce a feature map.
Input image (5x5) Kernel (3x3) Output feature map (3x3)
1 1 1 0 0 1 0 1 4 3 4
0 1 1 1 0 * 0 1 0 = 2 4 3
0 0 1 1 1 1 0 1 2 3 4
0 0 1 1 0
0 1 1 0 0
At each position, the output is the sum of element-wise products between the kernel and the image patch.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
# Visualize convolution
def visualize_convolution():
    """Plot a toy 5x5 image, an edge-detection kernel, and their convolution."""
    sample = torch.tensor([[
        [1., 1., 1., 0., 0.],
        [0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1.],
        [0., 0., 1., 1., 0.],
        [0., 1., 1., 0., 0.]
    ]]).unsqueeze(0)  # (1, 1, 5, 5): add batch and channel dims
    # Classic Laplacian-style edge detector.
    kernel = torch.tensor([[
        [[-1., -1., -1.],
         [-1., 8., -1.],
         [-1., -1., -1.]]
    ]])  # (1, 1, 3, 3)
    feature_map = F.conv2d(sample, kernel, padding=1)
    fig, panels = plt.subplots(1, 3, figsize=(12, 4))
    displays = [
        (sample[0, 0].numpy(), 'gray', 'Input Image'),
        (kernel[0, 0].numpy(), 'RdYlBu', 'Edge Detection Kernel'),
        (feature_map[0, 0].detach().numpy(), 'gray', 'Output Feature Map'),
    ]
    for panel, (img, cmap, title) in zip(panels, displays):
        panel.imshow(img, cmap=cmap)
        panel.set_title(title)
    plt.tight_layout()
    plt.show()
Kernel, Stride, and Padding
import torch
import torch.nn as nn
# A typical first convolution: RGB input, 64 learned filters, "same" padding.
conv = nn.Conv2d(
    in_channels=3,    # RGB input
    out_channels=64,  # number of filters / output feature maps
    kernel_size=3,
    stride=1,
    padding=1,        # keeps H and W unchanged at stride 1
    bias=True
)
# Output size formula
# H_out = floor((H_in + 2*padding - kernel_size) / stride + 1)
def calc_output_size(input_size, kernel_size, stride, padding):
    """Spatial output size of a conv/pool layer (floor convention)."""
    padded = input_size + 2 * padding
    return (padded - kernel_size) // stride + 1
# Sanity checks for the output-size formula above.
print(calc_output_size(224, 3, 1, 1))  # 224 (same padding)
print(calc_output_size(224, 3, 2, 1))  # 112 (stride 2 halves the size)
print(calc_output_size(224, 7, 2, 3))  # 112 (AlexNet-style first layer)
# Conv2d parameter count: (kernel_h * kernel_w * in_channels + 1) * out_channels
params = (3 * 3 * 3 + 1) * 64
print(f"Conv(3->64, 3x3) parameters: {params:,}")  # 1,792
Pooling (Max, Average, Global)
import torch
import torch.nn as nn
x = torch.randn(1, 64, 28, 28)
# Max pooling keeps the strongest activation in each 2x2 window.
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
out_max = max_pool(x)  # (1, 64, 14, 14)
# Average pooling smooths instead of selecting.
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
out_avg = avg_pool(x)  # (1, 64, 14, 14)
# Global average pooling: one scalar per channel.
gap = nn.AdaptiveAvgPool2d(1)
out_gap = gap(x)  # (1, 64, 1, 1)
out_gap_flat = out_gap.flatten(1)  # (1, 64)
# Adaptive pooling hits a fixed output size for any input resolution.
adaptive = nn.AdaptiveAvgPool2d((7, 7))
out_adaptive = adaptive(x)  # (1, 64, 7, 7)
print(f"Input: {x.shape}")
print(f"MaxPool: {out_max.shape}")
print(f"GAP: {out_gap_flat.shape}")
Receptive Field Calculation
def calculate_receptive_field(layers):
    """Compute the receptive field of a stack of conv/pool layers.

    Args:
        layers: sequence of (kernel_size, stride, dilation) tuples,
            in network order.

    Returns:
        Receptive-field side length, in input pixels.
    """
    rf, jump = 1, 1
    for kernel, stride, dilation in layers:
        # Dilation spreads the kernel taps apart.
        span = dilation * (kernel - 1) + 1
        rf += (span - 1) * jump
        jump *= stride  # gap between adjacent outputs, in input pixels
    return rf
# VGG-style stack: only 3x3 convs and 2x2 pools.
vgg_layers = [
    (3, 1, 1),  # conv1
    (3, 1, 1),  # conv2
    (2, 2, 1),  # pool
    (3, 1, 1),  # conv3
    (3, 1, 1),  # conv4
    (2, 2, 1),  # pool
]
rf = calculate_receptive_field(vgg_layers)
print(f"Receptive field after 6 VGG layers: {rf}x{rf} pixels")
# Two stacked 3x3 convs see the same 5x5 window as a single 5x5 conv,
# but cost 2*(9*C^2) parameters instead of 25*C^2 — about 28% fewer.
2. CNN Architecture History
LeNet-5 (1998, LeCun) — The First Practical CNN
LeNet-5, developed by Yann LeCun in 1998, was the first practical CNN, designed for handwritten digit recognition (MNIST).
Architecture: Input(32x32) -> C1(conv, 6@28x28) -> S2(pool, 6@14x14) -> C3(conv, 16@10x10) -> S4(pool, 16@5x5) -> C5(conv, 120@1x1) -> F6(fc, 84) -> Output(10)
import torch
import torch.nn as nn
class LeNet5(nn.Module):
    """LeNet-5 for 32x32 grayscale digits (Tanh activations, as in 1998).

    Layout: conv(6) -> avgpool -> conv(16) -> avgpool -> conv(120)
    -> fc(84) -> fc(num_classes).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),     # C1: 1@32x32 -> 6@28x28
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),                   # S2: -> 6@14x14
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),    # C3: -> 16@10x10
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),                   # S4: -> 16@5x5
            nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0),  # C5: -> 120@1x1
            nn.Tanh(),
        )
        self.classifier = nn.Sequential(
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        feats = self.features(x).flatten(1)
        return self.classifier(feats)
# Smoke test: forward a random batch and count parameters.
model = LeNet5(num_classes=10)
x = torch.randn(4, 1, 32, 32)
out = model(x)
print(f"LeNet-5 output: {out.shape}")  # (4, 10)
total_params = sum(t.numel() for t in model.parameters())
print(f"LeNet-5 total parameters: {total_params:,}")  # ~60K
AlexNet (2012, Krizhevsky) — The Deep Learning Renaissance
AlexNet won the 2012 ImageNet competition with a top-5 error rate of 15.3%, obliterating the previous best of 26.2% and launching the deep learning era.
Key innovations:
- ReLU activation (6x faster training than Tanh)
- Dropout (0.5) to prevent overfitting
- Data augmentation (crops, flips)
- Local Response Normalization (LRN)
- Dual-GPU training
import torch
import torch.nn as nn
class AlexNet(nn.Module):
    """AlexNet (Krizhevsky et al., 2012) for 224x224 RGB images.

    Five conv layers (with local response normalization and max-pooling
    early on) followed by three fully-connected layers with dropout.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        conv_stack = [
            # Block 1: 3@224x224 -> 96@55x55, pooled to 96@27x27
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Block 2: -> 256@27x27, pooled to 256@13x13
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Blocks 3-5: three 3x3 convs, final pool to 256@6x6
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        pooled = self.avgpool(self.features(x))
        return self.classifier(pooled.flatten(1))
# Smoke test: forward a random batch and count parameters.
model = AlexNet(num_classes=1000)
x = torch.randn(4, 3, 224, 224)
out = model(x)
print(f"AlexNet output: {out.shape}")  # (4, 1000)
total_params = sum(t.numel() for t in model.parameters())
print(f"AlexNet parameters: {total_params:,}")  # ~61M
VGGNet (2014, Simonyan) — The Power of Depth
VGGNet from Oxford's Visual Geometry Group uses exclusively 3x3 kernels throughout, allowing dramatically increased depth.
Why 3x3?
- Two 3x3 convolutions = same receptive field as one 5x5 (saves 28% of parameters)
- Three 3x3 convolutions = same receptive field as one 7x7 (saves 45% of parameters)
- More non-linear transformations increase representational capacity
import torch
import torch.nn as nn
from typing import List, Union
class VGG(nn.Module):
    """VGG backbone + classifier (Simonyan & Zisserman, 2014).

    Args:
        features: convolutional feature extractor ending in 512 channels.
        num_classes: size of the final logit layer.
        dropout: dropout probability in the classifier.
    """

    def __init__(self, features: nn.Module, num_classes: int = 1000, dropout: float = 0.5):
        super().__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, num_classes),
        )
        self._initialize_weights()

    def forward(self, x):
        pooled = self.avgpool(self.features(x))
        return self.classifier(pooled.flatten(1))

    def _initialize_weights(self):
        """He init for convs, unit scale for BN, small normal for linears."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out',
                                        nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)
def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
    """Build a VGG feature stack from a config list.

    Each int is a 3x3 conv with that many output channels; the string
    'M' inserts a 2x2 max-pool. With `batch_norm`, a BatchNorm2d is
    placed between every conv and its ReLU.
    """
    modules: List[nn.Module] = []
    channels = 3  # RGB input
    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        width = int(entry)
        conv2d = nn.Conv2d(channels, width, kernel_size=3, padding=1)
        if batch_norm:
            modules.extend([conv2d, nn.BatchNorm2d(width), nn.ReLU(inplace=True)])
        else:
            modules.extend([conv2d, nn.ReLU(inplace=True)])
        channels = width
    return nn.Sequential(*modules)
# Layer configs: ints are 3x3 conv widths, 'M' marks a 2x2 max-pool.
cfgs = {
    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
              512, 512, 512, 'M', 512, 512, 512, 'M'],
    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M',
              512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
def vgg16(num_classes=1000):
    """VGG-16 with batch normalization (the 'vgg16_bn' variant)."""
    features = make_layers(cfgs['vgg16'], batch_norm=True)
    return VGG(features, num_classes=num_classes)
# Smoke test: forward a random batch through VGG-16 and count parameters.
model_vgg16 = vgg16()
x = torch.randn(2, 3, 224, 224)
out = model_vgg16(x)
print(f"VGG-16 output: {out.shape}")
params = sum(t.numel() for t in model_vgg16.parameters())
print(f"VGG-16 parameters: {params:,}")  # ~138M
GoogLeNet/Inception (2014, Szegedy) — Multi-scale Parallel Processing
The Inception module's key idea is to process different kernel sizes (1x1, 3x3, 5x5) in parallel, capturing features at multiple scales simultaneously.
import torch
import torch.nn as nn
class InceptionModule(nn.Module):
    """GoogLeNet Inception module: four parallel branches, concatenated.

    Branches: 1x1 conv; 1x1->3x3; 1x1->5x5; 3x3 max-pool -> 1x1.
    Output channels = n1x1 + n3x3 + n5x5 + pool_proj.
    """

    @staticmethod
    def _conv_bn_relu(cin, cout, **conv_kwargs):
        # Conv -> BN -> ReLU triple, returned as a list for unpacking.
        return [
            nn.Conv2d(cin, cout, **conv_kwargs),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
        ]

    def __init__(self, in_channels, n1x1, n3x3_reduce, n3x3,
                 n5x5_reduce, n5x5, pool_proj):
        super().__init__()
        make = self._conv_bn_relu
        self.branch1 = nn.Sequential(*make(in_channels, n1x1, kernel_size=1))
        # 1x1 bottleneck keeps the 3x3/5x5 branches cheap.
        self.branch2 = nn.Sequential(
            *make(in_channels, n3x3_reduce, kernel_size=1),
            *make(n3x3_reduce, n3x3, kernel_size=3, padding=1),
        )
        self.branch3 = nn.Sequential(
            *make(in_channels, n5x5_reduce, kernel_size=1),
            *make(n5x5_reduce, n5x5, kernel_size=5, padding=2),
        )
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            *make(in_channels, pool_proj, kernel_size=1),
        )

    def forward(self, x):
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return torch.cat([branch(x) for branch in branches], dim=1)
# Smoke test with the original GoogLeNet 3a configuration.
module = InceptionModule(192, 64, 96, 128, 16, 32, 32)
x = torch.randn(2, 192, 28, 28)
out = module(x)
print(f"Inception output: {out.shape}")  # (2, 256, 28, 28)
ResNet (2015, He) — Solving the Vanishing Gradient with Residual Connections
ResNet, introduced by He Kaiming in 2015, uses skip connections to allow gradients to flow through very deep networks, enabling training of networks with 152 layers.
Core idea: H(x) = F(x) + x
Instead of learning H(x) directly, each layer learns the residual F(x) = H(x) - x. When the optimal mapping is close to the identity, driving F(x) toward zero is much easier.
import torch
import torch.nn as nn
from typing import Optional, Type, List
class BasicBlock(nn.Module):
    """ResNet basic block (two 3x3 convs) used in ResNet-18/34."""

    expansion = 1  # output channels = out_channels * expansion

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Optional projection to match the residual's shape.
        self.downsample = downsample

    def forward(self, x):
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut  # the residual connection
        return self.relu(out)
class Bottleneck(nn.Module):
    """ResNet bottleneck (1x1 reduce -> 3x3 -> 1x1 expand), ResNet-50+."""

    expansion = 4  # the final 1x1 conv widens channels by this factor

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        wide = out_channels * self.expansion
        # 1x1: shrink channels before the spatial conv.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # 3x3: the only spatially-mixing (and possibly strided) conv.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1: expand back to out_channels * 4.
        self.conv3 = nn.Conv2d(out_channels, wide, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(wide)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """Generic ResNet: stem -> 4 residual stages -> GAP -> fc.

    Args:
        block: residual block class exposing an `expansion` attribute.
        layers: number of blocks in each of the four stages.
        num_classes: size of the final classifier.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()
        self.in_channels = 64
        # Stem: 7x7/2 conv + 3x3/2 max-pool (4x spatial reduction).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four stages; stages 2-4 downsample via their first block's stride.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        self._initialize_weights()

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Stack `blocks` residual blocks, projecting the shortcut if needed."""
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion)
            )
        stage = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels * block.expansion
        stage.extend(block(self.in_channels, out_channels)
                     for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def _initialize_weights(self):
        """He init for convs; unit scale / zero shift for BatchNorm."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(x).flatten(1)
        return self.fc(x)
def resnet18(num_classes=1000):
    """18-layer ResNet: BasicBlock x [2, 2, 2, 2]."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

def resnet34(num_classes=1000):
    """34-layer ResNet: BasicBlock x [3, 4, 6, 3]."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)

def resnet50(num_classes=1000):
    """50-layer ResNet: Bottleneck x [3, 4, 6, 3]."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)

def resnet101(num_classes=1000):
    """101-layer ResNet: Bottleneck x [3, 4, 23, 3]."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)

def resnet152(num_classes=1000):
    """152-layer ResNet: Bottleneck x [3, 8, 36, 3]."""
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)
# Quick shape / parameter-count check on two representative depths.
for name, model_fn in [('ResNet-18', resnet18), ('ResNet-50', resnet50)]:
    model = model_fn()
    x = torch.randn(2, 3, 224, 224)
    out = model(x)
    params = sum(t.numel() for t in model.parameters())
    print(f"{name}: output={out.shape}, params={params:,}")
DenseNet (2017, Huang) — Dense Connectivity
DenseNet connects each layer to every previous layer. With L layers, ResNet has L connections but DenseNet has L(L+1)/2 connections.
import torch
import torch.nn as nn
import torch.nn.functional as F
class DenseLayer(nn.Module):
    """One DenseNet layer: BN-ReLU-1x1 bottleneck, then BN-ReLU-3x3.

    Accepts either a tensor or the list of all preceding feature maps
    and emits `growth_rate` new channels.
    """

    def __init__(self, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super().__init__()
        bottleneck_width = bn_size * growth_rate
        # Pre-activation bottleneck keeps the 3x3 conv's input narrow.
        self.norm1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, bottleneck_width,
                               kernel_size=1, bias=False)
        self.norm2 = nn.BatchNorm2d(bottleneck_width)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(bottleneck_width, growth_rate,
                               kernel_size=3, padding=1, bias=False)
        self.drop_rate = drop_rate

    def forward(self, x):
        inputs = [x] if isinstance(x, torch.Tensor) else x
        # Concatenate every preceding feature map channel-wise.
        stacked = torch.cat(inputs, dim=1)
        out = self.conv1(self.relu1(self.norm1(stacked)))
        out = self.conv2(self.relu2(self.norm2(out)))
        if self.drop_rate > 0:
            out = F.dropout(out, p=self.drop_rate, training=self.training)
        return out
class DenseBlock(nn.Module):
    """Stack of DenseLayers with dense (all-to-all) connectivity.

    Layer i receives the block input plus every earlier layer's output,
    so its input width is in_channels + i * growth_rate; the block
    returns everything concatenated.
    """

    def __init__(self, num_layers, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super().__init__()
        self.layers = nn.ModuleList(
            DenseLayer(in_channels + idx * growth_rate,
                       growth_rate, bn_size, drop_rate)
            for idx in range(num_layers)
        )

    def forward(self, x):
        collected = [x]
        for layer in self.layers:
            # Each layer sees the full list of prior feature maps.
            collected.append(layer(collected))
        return torch.cat(collected, dim=1)
class TransitionLayer(nn.Module):
    """DenseNet transition: BN-ReLU-1x1 compression, then 2x2 avg-pool."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        compressed = self.conv(self.relu(self.norm(x)))
        return self.pool(compressed)
MobileNet (2017) — Lightweight for Edge Devices
MobileNet introduced Depthwise Separable Convolutions, drastically reducing computation while maintaining accuracy — ideal for mobile and edge deployment.
import torch
import torch.nn as nn
class DepthwiseSeparableConv(nn.Module):
    """Depthwise separable convolution (MobileNetV1 building block).

    A per-channel 3x3 depthwise conv followed by a 1x1 pointwise conv
    that mixes information across channels.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # groups=in_channels -> each channel is filtered independently.
        self.depthwise = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=3,
                      stride=stride, padding=1, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU6(inplace=True),
        )
        # 1x1 conv combines channels.
        self.pointwise = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU6(inplace=True),
        )

    def forward(self, x):
        return self.pointwise(self.depthwise(x))
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted residual: expand -> depthwise -> project.

    The skip connection is only used when the block preserves both the
    spatial size (stride 1) and the channel count.
    """

    def __init__(self, in_channels, out_channels, stride, expand_ratio):
        super().__init__()
        self.stride = stride
        hidden_dim = int(in_channels * expand_ratio)
        self.use_res_connect = (stride == 1 and in_channels == out_channels)
        stages = []
        if expand_ratio != 1:
            # 1x1 expansion to a wider representation.
            stages += [
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        stages += [
            # Depthwise 3x3 (possibly strided) on the expanded features.
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride=stride,
                      padding=1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Linear (no activation) 1x1 projection back down.
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        y = self.conv(x)
        return x + y if self.use_res_connect else y
# Parameter comparison at 512->512 channels: standard 3x3 conv
# versus depthwise 3x3 + pointwise 1x1.
standard_conv_params = 3 * 3 * 512 * 512        # full cross-channel 3x3
dw_sep_params = (3 * 3 * 512) + (512 * 512)     # depthwise + pointwise
print(f"Standard conv: {standard_conv_params:,}")
print(f"Depthwise Separable: {dw_sep_params:,}")
print(f"Savings: {(1 - dw_sep_params/standard_conv_params):.1%}")
EfficientNet (2019, Tan) — Compound Scaling
EfficientNet proposes scaling width, depth, and resolution together using a compound coefficient, achieving the best accuracy-efficiency tradeoff at the time.
# EfficientNet compound-scaling table:
# each entry is (width_coeff, depth_coeff, resolution, dropout_rate).
efficientnet_params = {
    'b0': (1.0, 1.0, 224, 0.2),
    'b1': (1.0, 1.1, 240, 0.2),
    'b2': (1.1, 1.2, 260, 0.3),
    'b3': (1.2, 1.4, 300, 0.3),
    'b4': (1.4, 1.8, 380, 0.4),
    'b5': (1.6, 2.2, 456, 0.4),
    'b6': (1.8, 2.6, 528, 0.5),
    'b7': (2.0, 3.1, 600, 0.5),
}
print("EfficientNet scaling parameters:")
for version, (w, d, r, drop) in efficientnet_params.items():
    print(f" B{version[1]}: width={w:.1f}, depth={d:.1f}, res={r}, dropout={drop}")
ConvNeXt (2022, Liu) — A ConvNet for the 2020s
ConvNeXt modernizes the CNN design space by importing ideas from Vision Transformers — large kernels, LayerNorm, GELU, and inverted bottlenecks — achieving Transformer-competitive performance.
import torch
import torch.nn as nn
class ConvNeXtBlock(nn.Module):
    """ConvNeXt block (Liu et al., 2022): 7x7 depthwise conv ->
    LayerNorm -> inverted-bottleneck MLP, with layer scale and a
    residual connection."""

    def __init__(self, dim, layer_scale_init_value=1e-6):
        super().__init__()
        # Large-kernel depthwise conv does the spatial mixing.
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        # Inverted bottleneck implemented as Linear on channels-last data.
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        # Learnable per-channel residual scaling (disabled when init <= 0).
        if layer_scale_init_value > 0:
            self.gamma = nn.Parameter(
                layer_scale_init_value * torch.ones(dim),
                requires_grad=True
            )
        else:
            self.gamma = None

    def forward(self, x):
        shortcut = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # NCHW -> NHWC for LayerNorm/Linear
        x = self.pwconv2(self.act(self.pwconv1(self.norm(x))))
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # back to NCHW
        return shortcut + x
3. Vision Transformer (ViT)
ViT splits images into patches and applies a Transformer, treating each patch as a token — a fundamentally different paradigm from traditional CNNs.
import torch
import torch.nn as nn
class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and linearly embed each.

    Implemented as a single convolution whose kernel size and stride both
    equal the patch size, so every output position is one patch token.
    """

    def __init__(self, image_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super(PatchEmbedding, self).__init__()
        grid = image_size // patch_size
        self.num_patches = grid * grid
        # One strided conv = patch extraction + linear projection in one op.
        self.projection = nn.Conv2d(
            in_channels, embed_dim,
            kernel_size=patch_size, stride=patch_size,
        )

    def forward(self, x):
        """(B, C, H, W) -> (B, num_patches, embed_dim)."""
        tokens = self.projection(x)           # (B, embed_dim, H/p, W/p)
        tokens = tokens.flatten(start_dim=2)  # (B, embed_dim, num_patches)
        return tokens.transpose(1, 2)         # (B, num_patches, embed_dim)
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection."""

    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # 1/sqrt(d_k) scaling from "Attention Is All You Need".
        self.scale = self.head_dim ** -0.5
        # Single linear computes Q, K, V together for efficiency.
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch, tokens, channels = x.shape
        # Project once, then split into q/k/v, each (B, heads, N, head_dim).
        qkv = (
            self.qkv(x)
            .reshape(batch, tokens, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)
        scores = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.dropout(scores.softmax(dim=-1))
        # Merge heads back into the channel dimension, then project out.
        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj(merged)
class TransformerBlock(nn.Module):
    """Pre-norm Transformer encoder block: MHSA + MLP, each with residual."""

    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.0):
        super(TransformerBlock, self).__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        hidden = int(embed_dim * mlp_ratio)
        # Position-wise feed-forward network with GELU and dropout.
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        # Pre-norm residual sublayers: x + f(norm(x)).
        x = x + self.attn(self.norm1(x))
        return x + self.mlp(self.norm2(x))
class VisionTransformer(nn.Module):
    """Vision Transformer (ViT) image classifier.

    Tokenizes the image into patch embeddings, prepends a learnable CLS
    token, adds learned positional embeddings, runs a stack of Transformer
    blocks, and classifies from the final CLS representation.
    """

    def __init__(self, image_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12,
                 mlp_ratio=4.0, dropout=0.0):
        super(VisionTransformer, self).__init__()
        self.patch_embed = PatchEmbedding(image_size, patch_size, in_channels, embed_dim)
        n_tokens = self.patch_embed.num_patches + 1  # +1 for the CLS token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(torch.zeros(1, n_tokens, embed_dim))
        self.pos_dropout = nn.Dropout(dropout)
        # Encoder: `depth` identical Transformer blocks.
        self.blocks = nn.Sequential(*(
            TransformerBlock(embed_dim, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ))
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)
        self._init_weights()

    def _init_weights(self):
        """Truncated-normal init (std=0.02) for embeddings and Linear weights."""
        nn.init.trunc_normal_(self.pos_embedding, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x):
        tokens = self.patch_embed(x)  # (B, num_patches, embed_dim)
        cls = self.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat([cls, tokens], dim=1)  # prepend CLS token
        tokens = self.pos_dropout(tokens + self.pos_embedding)
        tokens = self.norm(self.blocks(tokens))
        # Classification uses only the CLS token's final representation.
        return self.head(tokens[:, 0])
def vit_base(num_classes=1000):
    """Build ViT-Base/16: 12 layers, 12 heads, 768-dim embeddings."""
    config = dict(
        image_size=224, patch_size=16, embed_dim=768,
        depth=12, num_heads=12, num_classes=num_classes,
    )
    return VisionTransformer(**config)
# Sanity check: forward a dummy batch through ViT-Base and report
# the output shape and total parameter count.
model = vit_base()
dummy = torch.randn(2, 3, 224, 224)
out = model(dummy)
params = sum(p.numel() for p in model.parameters())
print(f"ViT-Base output: {out.shape}, parameters: {params:,}")
4. Object Detection: YOLO
import torch
import torch.nn as nn
class YOLOHead(nn.Module):
    """Simplified YOLO detection head.

    For every spatial cell and anchor, predicts a (5 + num_classes) vector:
    box offsets (x, y, w, h), an objectness score, and class logits.
    Output shape: (B, num_anchors, H, W, 5 + num_classes).
    """

    def __init__(self, in_channels, num_anchors, num_classes):
        super(YOLOHead, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        # One prediction vector per anchor at every cell.
        out_channels = num_anchors * (5 + num_classes)
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, in_channels * 2, kernel_size=3, padding=1),
            nn.BatchNorm2d(in_channels * 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(in_channels * 2, out_channels, kernel_size=1),
        )

    def forward(self, x):
        raw = self.head(x)
        batch, _, height, width = raw.shape
        # (B, A*(5+C), H, W) -> (B, A, 5+C, H, W) -> (B, A, H, W, 5+C)
        raw = raw.reshape(batch, self.num_anchors, 5 + self.num_classes, height, width)
        return raw.permute(0, 1, 3, 4, 2).contiguous()
5. Image Segmentation: U-Net
import torch
import torch.nn as nn
import torch.nn.functional as F
class DoubleConv(nn.Module):
    """U-Net building block: (Conv3x3 -> BatchNorm -> ReLU) applied twice."""

    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        layers = []
        channels = in_channels
        # Two identical conv stages; only the first changes channel count.
        for _ in range(2):
            layers.extend([
                nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ])
            channels = out_channels
        self.double_conv = nn.Sequential(*layers)

    def forward(self, x):
        return self.double_conv(x)
class UNet(nn.Module):
    """U-Net encoder-decoder for image segmentation (Ronneberger et al., 2015).

    The encoder applies DoubleConv + 2x max-pool at each stage; the decoder
    upsamples with transposed convolutions and concatenates the matching
    encoder feature map (skip connection) before each DoubleConv.

    Args:
        in_channels: channels of the input image (1 for grayscale).
        num_classes: number of output segmentation classes.
        features: channel widths of the encoder stages, shallow to deep.
    """

    # NOTE: `features` default changed from a list to a tuple — a mutable
    # default argument is shared across calls and a well-known pitfall.
    def __init__(self, in_channels=1, num_classes=2, features=(64, 128, 256, 512)):
        super(UNet, self).__init__()
        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.pool = nn.MaxPool2d(2, 2)
        # Encoder path: one DoubleConv per feature width.
        channels = in_channels
        for feature in features:
            self.encoders.append(DoubleConv(channels, feature))
            channels = feature
        # Bottleneck doubles the deepest width.
        self.bottleneck = DoubleConv(features[-1], features[-1] * 2)
        # Decoder path: interleaved (upsample, DoubleConv) pairs, widest first.
        for feature in reversed(features):
            self.decoders.append(
                nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2)
            )
            # Input width doubles because of the skip concatenation.
            self.decoders.append(DoubleConv(feature * 2, feature))
        # 1x1 conv maps the final feature map to per-pixel class logits.
        self.final_conv = nn.Conv2d(features[0], num_classes, kernel_size=1)

    def forward(self, x):
        skip_connections = []
        # Encoder: save each stage's output for the skip connections.
        for encoder in self.encoders:
            x = encoder(x)
            skip_connections.append(x)
            x = self.pool(x)
        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]  # deepest first
        # Decoder: even indices upsample, odd indices run DoubleConv.
        for i in range(0, len(self.decoders), 2):
            x = self.decoders[i](x)
            skip = skip_connections[i // 2]
            if x.shape != skip.shape:
                # Odd input sizes leave the upsampled map slightly smaller;
                # resize it to align with the encoder feature map.
                x = F.interpolate(x, size=skip.shape[2:])
            x = torch.cat([skip, x], dim=1)  # channel-wise skip connection
            x = self.decoders[i + 1](x)
        return self.final_conv(x)
# Sanity check: padded convolutions preserve spatial size, so the
# segmentation map matches the 572x572 input resolution.
model = UNet(in_channels=1, num_classes=2)
sample = torch.randn(4, 1, 572, 572)
out = model(sample)
print(f"U-Net output: {out.shape}")  # (4, 2, 572, 572)
6. Transfer Learning in Practice
Using torchvision.models
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from tqdm import tqdm
# Load ImageNet-pretrained backbones from torchvision.
# First use downloads the weight files and caches them locally.
model_resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
model_efficientnet = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)
model_vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
def feature_extraction(num_classes, freeze=True):
    """Build a ResNet-50 feature extractor with a fresh classification head.

    With freeze=True the pretrained backbone is frozen, so only the newly
    attached head trains. Prints the trainable/total parameter ratio and
    returns the model.
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    if freeze:
        # Freeze every pretrained weight; the new head is enabled below.
        for param in model.parameters():
            param.requires_grad = False
    # Swap the ImageNet classifier for a small task-specific head.
    head_in = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(head_in, 256),
        nn.ReLU(),
        nn.Linear(256, num_classes),
    )
    # Fresh modules default to requires_grad=True; set explicitly for clarity.
    for param in model.fc.parameters():
        param.requires_grad = True
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable:,} / {total:,} ({trainable/total:.1%})")
    return model
def fine_tuning(num_classes, unfreeze_layers=2):
    """Fine-tuning: freeze the ResNet-50 backbone, then unfreeze the last
    `unfreeze_layers` tail layers and replace the classifier head."""
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    # Freeze everything first; selected tail layers are re-enabled below.
    for param in model.parameters():
        param.requires_grad = False
    # Candidate tail layers, shallowest to deepest.
    # NOTE(review): avgpool has no parameters, so the default
    # unfreeze_layers=2 effectively trains only the (replaced) fc head;
    # pass unfreeze_layers=3 to also train layer4.
    layers = [model.layer4, model.avgpool, model.fc]
    for layer in layers[-unfreeze_layers:]:
        for param in layer.parameters():
            param.requires_grad = True
    # Replacing fc AFTER the unfreeze loop still yields a trainable head:
    # a freshly constructed Linear has requires_grad=True by default.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model
def train_model(model, train_loader, val_loader, epochs=10,
                learning_rate=1e-3, device='cuda'):
    """Train a classifier with discriminative learning rates + cosine decay.

    Parameters whose names contain 'fc' (the head) train at `learning_rate`;
    all other trainable parameters use a 10x smaller rate. The best
    validation checkpoint is written to 'best_model.pt'; the returned model
    carries the LAST epoch's weights, not necessarily the best ones.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Split trainable parameters into backbone vs head groups.
    # NOTE(review): the 'fc' substring match assumes a ResNet-style head
    # attribute name — verify if used with other architectures.
    backbone_params = [p for n, p in model.named_parameters()
                       if 'fc' not in n and p.requires_grad]
    head_params = [p for n, p in model.named_parameters()
                   if 'fc' in n and p.requires_grad]
    # Backbone gets 10x lower LR so pretrained features shift slowly.
    optimizer = optim.AdamW([
        {'params': backbone_params, 'lr': learning_rate * 0.1},
        {'params': head_params, 'lr': learning_rate}
    ], weight_decay=1e-4)
    # Cosine decay to ~0 over the full training run.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    best_val_acc = 0.0
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_correct, train_total = 0, 0
        for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            # Clip gradients to stabilize fine-tuning.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_correct += (outputs.argmax(1) == labels).sum().item()
            train_total += images.size(0)
        # ---- validation pass (no gradients) ----
        model.eval()
        val_correct, val_total = 0, 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_correct += (outputs.argmax(1) == labels).sum().item()
                val_total += images.size(0)
        # Step the LR schedule once per epoch.
        scheduler.step()
        val_acc = val_correct / val_total
        print(f"Epoch {epoch+1}: Train={train_correct/train_total:.4f}, Val={val_acc:.4f}")
        # Checkpoint the best validation accuracy seen so far.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
    print(f"Best validation accuracy: {best_val_acc:.4f}")
    return model
# Data augmentation
from torchvision import transforms
def get_transforms(image_size=224):
    """Return (train_transforms, val_transforms) torchvision pipelines.

    Training applies standard augmentation (random resized crop, flip,
    rotation, color jitter); validation does resize + center crop. Both
    normalize with ImageNet channel statistics.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(
            brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])
    val_transforms = transforms.Compose([
        # Resize ~14% larger than the crop (256 for a 224 crop).
        transforms.Resize(int(image_size * 1.14)),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])
    return train_transforms, val_transforms
Architecture Performance Comparison (ImageNet-1k top-1 unless otherwise noted)
| Model | Year | Top-1 Accuracy | Parameters | FLOPs |
|---|---|---|---|---|
| LeNet-5 | 1998 | ~99% (MNIST) | 60K | - |
| AlexNet | 2012 | 56.5% | 61M | 724M |
| VGG-16 | 2014 | 71.6% | 138M | 15.5G |
| GoogLeNet | 2014 | 68.7% | 6.8M | 1.5G |
| ResNet-50 | 2015 | 75.3% | 25M | 4.1G |
| DenseNet-121 | 2017 | 74.4% | 8M | 2.9G |
| MobileNetV2 | 2018 | 71.8% | 3.4M | 300M |
| EfficientNet-B0 | 2019 | 77.1% | 5.3M | 390M |
| ConvNeXt-T | 2022 | 82.1% | 28M | 4.5G |
| ViT-B/16 | 2020 | 81.8% | 86M | 17.6G |
Conclusion
CNN architectures have undergone remarkable evolution:
- LeNet (1998): First practical CNN, establishing the foundational structure
- AlexNet (2012): Deep learning renaissance, introduced ReLU and Dropout
- VGGNet (2014): The power of 3x3 convolutions, proving depth matters
- ResNet (2015): Residual connections solved the vanishing gradient problem
- DenseNet (2017): Dense connections maximized feature reuse
- MobileNet (2017): Depthwise separable convolutions enabled mobile deployment
- EfficientNet (2019): Compound scaling achieved state-of-the-art efficiency
- ConvNeXt (2022): Modernized CNN design with Transformer-inspired principles
- ViT (2020): Treating images as sequences opened a new paradigm
In practice, start from torchvision's pretrained models and apply transfer learning to quickly adapt to your target task.
References
- PyTorch Vision Models
- ResNet paper: He et al., "Deep Residual Learning for Image Recognition" (arXiv:1512.03385)
- EfficientNet paper: Tan and Le, "EfficientNet: Rethinking Model Scaling" (arXiv:1905.11946)
- ViT paper: Dosovitskiy et al., "An Image is Worth 16x16 Words" (arXiv:2010.11929)
- ConvNeXt paper: Liu et al., "A ConvNet for the 2020s" (arXiv:2201.03545)