- Authors

- Name
- Youngju Kim
- @fjvbn20031
CNN 아키텍처 완전 정복
컨볼루션 신경망(CNN, Convolutional Neural Network)은 컴퓨터 비전 혁명의 핵심입니다. 1998년 LeNet의 등장부터 2022년 ConvNeXt와 Vision Transformer까지, CNN 아키텍처는 놀라운 속도로 발전해 왔습니다. 이 가이드에서는 주요 CNN 아키텍처의 구조적 혁신을 이해하고, PyTorch로 직접 구현하는 방법을 완전히 마스터합니다.
1. CNN 기초
합성곱 연산 직관적 이해
합성곱(Convolution)은 이미지의 지역적 패턴을 추출하는 연산입니다. 작은 필터(커널)가 이미지 위를 슬라이딩하며 특징 맵(Feature Map)을 생성합니다.
입력 이미지 (5x5) 커널 (3x3) 출력 특징 맵 (3x3)
1 1 1 0 0 1 0 1 4 3 4
0 1 1 1 0 * 0 1 0 = 2 4 3
0 0 1 1 1 1 0 1 2 3 4
0 0 1 1 0
0 1 1 0 0
각 위치에서 커널과 이미지 패치의 원소별 곱의 합이 출력 특징 맵의 값이 됩니다.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
# Visualize a single 2D convolution: input, kernel, and resulting feature map.
def visualize_convolution():
    """Apply a 3x3 edge-detection kernel to a 5x5 binary image and plot all three."""
    # Sample image, shaped (1, 1, 5, 5) = (batch, channels, H, W).
    sample = torch.tensor([[
        [1., 1., 1., 0., 0.],
        [0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1.],
        [0., 0., 1., 1., 0.],
        [0., 1., 1., 0., 0.]
    ]]).unsqueeze(0)
    # Classic Laplacian-style edge-detection kernel, shaped (1, 1, 3, 3).
    kernel = torch.tensor([[
        [[-1., -1., -1.],
         [-1., 8., -1.],
         [-1., -1., -1.]]
    ]])
    feature_map = F.conv2d(sample, kernel, padding=1)

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].imshow(sample[0, 0].numpy(), cmap='gray')
    axes[0].set_title('입력 이미지')
    axes[1].imshow(kernel[0, 0].numpy(), cmap='RdYlBu')
    axes[1].set_title('엣지 감지 커널')
    axes[2].imshow(feature_map[0, 0].detach().numpy(), cmap='gray')
    axes[2].set_title('출력 특징 맵')
    plt.tight_layout()
    plt.show()

visualize_convolution()
커널/필터, 스트라이드, 패딩
import torch
import torch.nn as nn
# Basic Conv2d hyper-parameters.
conv = nn.Conv2d(
    in_channels=3,    # input channels (RGB = 3)
    out_channels=64,  # output channels (= number of filters)
    kernel_size=3,    # 3x3 kernel
    stride=1,
    padding=1,        # "same" padding at stride 1
    bias=True,
)


def calc_output_size(input_size, kernel_size, stride, padding):
    """Return the spatial output size of a convolution.

    Formula: floor((input + 2*padding - kernel) / stride) + 1.
    """
    return (input_size + 2 * padding - kernel_size) // stride + 1


print(calc_output_size(224, 3, 1, 1))  # 224 (same padding)
print(calc_output_size(224, 3, 2, 1))  # 112 (stride 2 halves the size)
print(calc_output_size(224, 7, 2, 3))  # 112 (7x7 stride-2 stem, e.g. ResNet)
# Conv2d parameter count: (kernel_h * kernel_w * in_channels + 1) * out_channels.
params = (3 * 3 * 3 + 1) * 64
print(f"Conv(3->64, 3x3) 파라미터 수: {params:,}")  # 1,792
Pooling (Max, Average, Global)
import torch
import torch.nn as nn
import torch.nn.functional as F
x = torch.randn(1, 64, 28, 28)

# Max pooling: keep the strongest activation in each 2x2 window.
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
out_max = max_pool(x)  # (1, 64, 14, 14)

# Average pooling: mean over each 2x2 window.
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
out_avg = avg_pool(x)  # (1, 64, 14, 14)

# Global Average Pooling (GAP): collapse the spatial dims to 1x1.
gap = nn.AdaptiveAvgPool2d(1)
out_gap = gap(x)                    # (1, 64, 1, 1)
out_gap_flat = out_gap.flatten(1)   # (1, 64)

# Adaptive pooling: pick the output size, works for any input size.
adaptive = nn.AdaptiveAvgPool2d((7, 7))
out_adaptive = adaptive(x)  # (1, 64, 7, 7)

print(f"입력: {x.shape}")
print(f"MaxPool: {out_max.shape}")
print(f"GAP: {out_gap_flat.shape}")
Receptive Field 계산
def calculate_receptive_field(layers):
"""
각 레이어의 수용 영역 계산
layers: [(kernel_size, stride, dilation), ...]
"""
rf = 1
jump = 1
for k, s, d in layers:
effective_k = d * (k - 1) + 1
rf = rf + (effective_k - 1) * jump
jump = jump * s
return rf
# VGG 스타일 (3x3 conv만 사용)
vgg_layers = [
(3, 1, 1), # conv1
(3, 1, 1), # conv2
(2, 2, 1), # pool
(3, 1, 1), # conv3
(3, 1, 1), # conv4
(2, 2, 1), # pool
]
rf = calculate_receptive_field(vgg_layers)
print(f"VGG 6개 레이어 후 수용 영역: {rf}x{rf} 픽셀")
# 참고: 3x3 두 개 = 5x5 하나와 동일한 수용 영역
# 하지만 파라미터는 2*(9*C^2) vs 25*C^2 → 3x3 두 개가 더 효율적
2. CNN 발전사
LeNet-5 (1998, LeCun) - 최초의 실용적 CNN
LeNet-5는 Yann LeCun이 1998년 개발한 최초의 실용적 CNN으로, 손으로 쓴 숫자 인식(MNIST)에 사용되었습니다.
구조: Input(32x32) → C1(conv, 6@28x28) → S2(pool, 6@14x14) → C3(conv, 16@10x10) → S4(pool, 16@5x5) → C5(conv, 120@1x1) → F6(fc, 84) → Output(10)
import torch
import torch.nn as nn
class LeNet5(nn.Module):
    """LeNet-5 (LeCun, 1998) with the classic Tanh activations.

    Expects grayscale 32x32 input: tensors of shape (B, 1, 32, 32).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Convolutional trunk: C1 - S2 - C3 - S4 - C5.
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),     # C1: 1@32x32 -> 6@28x28
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),                   # S2: 6@28x28 -> 6@14x14
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),    # C3: 6@14x14 -> 16@10x10
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),                   # S4: 16@10x10 -> 16@5x5
            nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0),  # C5: 16@5x5 -> 120@1x1
            nn.Tanh(),
        )
        # Fully connected classifier: F6 then the output layer.
        self.classifier = nn.Sequential(
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape (B, num_classes)."""
        x = self.features(x)
        x = x.flatten(1)
        return self.classifier(x)


# Smoke test.
model = LeNet5(num_classes=10)
x = torch.randn(4, 1, 32, 32)
out = model(x)
print(f"LeNet-5 출력: {out.shape}")  # (4, 10)
total_params = sum(p.numel() for p in model.parameters())
print(f"LeNet-5 총 파라미터: {total_params:,}")  # ~60,000
AlexNet (2012, Krizhevsky) - 딥러닝 르네상스의 시작
AlexNet은 2012년 ImageNet 대회에서 top-5 오류율 15.3%를 달성하며 기존 최고 성능(26.2%)을 크게 앞서 딥러닝 시대를 열었습니다.
핵심 혁신:
- ReLU 활성화 함수 도입 (Tanh 대비 6배 빠른 학습)
- Dropout (0.5)으로 과적합 방지
- Data Augmentation (크롭, 반전)
- Local Response Normalization (LRN)
- GPU 2개 병렬 학습
import torch
import torch.nn as nn
class AlexNet(nn.Module):
    """AlexNet (Krizhevsky et al., 2012): ReLU, LRN, dropout, 224x224 input."""

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            # Layer 1: 3@224x224 -> 96@55x55.
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 96@27x27
            # Layer 2: 96@27x27 -> 256@27x27.
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 256@13x13
            # Layers 3-5: three 3x3 convs at 13x13.
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 256@6x6
        )
        # Adaptive pool makes the classifier input size fixed at 256*6*6.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),  # dropout before each hidden FC layer
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return logits of shape (B, num_classes) for (B, 3, 224, 224) input."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.classifier(x)


model = AlexNet(num_classes=1000)
x = torch.randn(4, 3, 224, 224)
out = model(x)
print(f"AlexNet 출력: {out.shape}")  # (4, 1000)
total_params = sum(p.numel() for p in model.parameters())
print(f"AlexNet 총 파라미터: {total_params:,}")  # ~61M
VGGNet (2014, Simonyan) - 깊이의 힘
VGGNet은 Oxford의 Visual Geometry Group이 개발했습니다. 핵심 인사이트는 모든 합성곱 레이어에 3x3 커널만 사용하여 깊이를 극적으로 늘린 것입니다.
왜 3x3인가?
- 3x3 두 개 = 5x5 하나의 수용 영역 (파라미터 수는 2×9C² vs 25C², 28% 절약)
- 3x3 세 개 = 7x7 하나의 수용 영역 (파라미터 수는 3×9C² vs 49C², 45% 절약)
- 더 많은 비선형 변환으로 표현력 향상
import torch
import torch.nn as nn
from typing import List, Union
class VGG(nn.Module):
    """Generic VGG wrapper: a conv trunk followed by the standard 3-FC head.

    `features` must end with 512 channels; the adaptive pool fixes the
    classifier input at 512*7*7 regardless of input resolution.
    """

    def __init__(self, features: nn.Module, num_classes: int = 1000, dropout: float = 0.5):
        super().__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, num_classes),
        )
        self._initialize_weights()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (B, num_classes)."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.classifier(x)

    def _initialize_weights(self):
        # Kaiming init for convs, unit BN, small-normal Linear — torchvision style.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)
def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
    """Build a VGG conv trunk from a config list.

    cfg entries: 'M' inserts a 2x2 max pool; an int adds a 3x3 conv with
    that many output channels (+BN if batch_norm) followed by ReLU.
    """
    layers: List[nn.Module] = []
    in_channels = 3  # RGB input
    for entry in cfg:
        if entry == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        width = int(entry)
        conv = nn.Conv2d(in_channels, width, kernel_size=3, padding=1)
        if batch_norm:
            layers.extend([conv, nn.BatchNorm2d(width), nn.ReLU(inplace=True)])
        else:
            layers.extend([conv, nn.ReLU(inplace=True)])
        in_channels = width
    return nn.Sequential(*layers)
# VGG configurations: conv widths, with 'M' marking each 2x2 max pool.
cfgs = {
    'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


def vgg16(num_classes=1000):
    """VGG-16 with batch norm."""
    return VGG(make_layers(cfgs['vgg16'], batch_norm=True), num_classes=num_classes)


def vgg19(num_classes=1000):
    """VGG-19 with batch norm."""
    return VGG(make_layers(cfgs['vgg19'], batch_norm=True), num_classes=num_classes)


# Smoke test.
model_vgg16 = vgg16()
x = torch.randn(2, 3, 224, 224)
out = model_vgg16(x)
print(f"VGG-16 출력: {out.shape}")
params_vgg16 = sum(p.numel() for p in model_vgg16.parameters())
print(f"VGG-16 파라미터: {params_vgg16:,}")  # ~138M
GoogLeNet/Inception (2014, Szegedy) - 병렬 다중 스케일 처리
Inception 모듈의 핵심은 동일한 레이어에서 서로 다른 크기의 커널(1x1, 3x3, 5x5)을 병렬로 처리하여 다양한 스케일의 특징을 동시에 추출하는 것입니다.
import torch
import torch.nn as nn
import torch.nn.functional as F
class InceptionModule(nn.Module):
    """GoogLeNet Inception module: four parallel branches concatenated on channels.

    Output channels = n1x1 + n3x3 + n5x5 + pool_proj; spatial size is preserved.
    """

    def __init__(self, in_channels, n1x1, n3x3_reduce, n3x3,
                 n5x5_reduce, n5x5, pool_proj):
        super().__init__()

        def conv_bn_relu(cin, cout, **kwargs):
            # Conv + BN + ReLU triple shared by every branch.
            return [nn.Conv2d(cin, cout, **kwargs),
                    nn.BatchNorm2d(cout),
                    nn.ReLU(inplace=True)]

        # Branch 1: plain 1x1 conv.
        self.branch1 = nn.Sequential(*conv_bn_relu(in_channels, n1x1, kernel_size=1))
        # Branch 2: 1x1 bottleneck then 3x3.
        self.branch2 = nn.Sequential(
            *conv_bn_relu(in_channels, n3x3_reduce, kernel_size=1),
            *conv_bn_relu(n3x3_reduce, n3x3, kernel_size=3, padding=1),
        )
        # Branch 3: 1x1 bottleneck then 5x5.
        self.branch3 = nn.Sequential(
            *conv_bn_relu(in_channels, n5x5_reduce, kernel_size=1),
            *conv_bn_relu(n5x5_reduce, n5x5, kernel_size=5, padding=2),
        )
        # Branch 4: 3x3 max pool then a 1x1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            *conv_bn_relu(in_channels, pool_proj, kernel_size=1),
        )

    def forward(self, x):
        """Run all branches and concatenate along the channel dimension."""
        outputs = [self.branch1(x), self.branch2(x), self.branch3(x), self.branch4(x)]
        return torch.cat(outputs, dim=1)


# Smoke test.
module = InceptionModule(192, 64, 96, 128, 16, 32, 32)
x = torch.randn(2, 192, 28, 28)
out = module(x)
print(f"Inception 출력: {out.shape}")  # (2, 256, 28, 28) = 64+128+32+32
ResNet (2015, He) - 잔차 연결로 기울기 소실 해결
ResNet은 He Kaiming이 2015년 발표한 혁신적 아키텍처입니다. 잔차 연결(Skip Connection)을 통해 기울기가 깊은 레이어까지 전달되어 152층의 매우 깊은 네트워크를 학습할 수 있게 되었습니다.
핵심 아이디어: H(x) = F(x) + x
레이어가 H(x)를 직접 학습하는 대신 잔차 F(x) = H(x) - x를 학습합니다. 최적 함수가 항등 함수에 가까울 때, 잔차를 0으로 만드는 것이 더 쉽습니다.
import torch
import torch.nn as nn
from typing import Optional, Type, List
class BasicBlock(nn.Module):
    """Residual block for ResNet-18/34: two 3x3 convs plus a skip connection."""

    expansion = 1  # output channels == out_channels * expansion

    def __init__(self, in_channels: int, out_channels: int,
                 stride: int = 1, downsample: Optional[nn.Module] = None):
        super().__init__()
        # First conv may downsample spatially (stride > 1).
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Optional projection that matches the shortcut to the main path.
        self.downsample = downsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return relu(F(x) + shortcut(x))."""
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut  # the residual connection
        return self.relu(out)
class Bottleneck(nn.Module):
    """Bottleneck block for ResNet-50/101/152: 1x1 reduce, 3x3, 1x1 expand (x4)."""

    expansion = 4  # output channels == out_channels * 4

    def __init__(self, in_channels: int, out_channels: int,
                 stride: int = 1, downsample: Optional[nn.Module] = None):
        super().__init__()
        # 1x1: shrink channels.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # 3x3: spatial processing (may downsample).
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1: expand channels by the block's expansion factor.
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return relu(F(x) + shortcut(x))."""
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """Full ResNet: 7x7 stem, four residual stages, GAP, and a linear head.

    `block` is BasicBlock or Bottleneck; `layers` gives the block count per stage.
    """

    def __init__(self, block: Type[nn.Module], layers: List[int],
                 num_classes: int = 1000):
        super().__init__()
        self.in_channels = 64  # running channel count, advanced by _make_layer

        # Stem: 7x7/s2 conv + 3x3/s2 max pool (224 -> 56).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages; stages 2-4 halve the spatial size.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        self._initialize_weights()

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage: first block may downsample, the rest keep shape."""
        downsample = None
        # A projection shortcut is needed when shape or channels change.
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )
        stage = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels * block.expansion
        stage.extend(block(self.in_channels, out_channels) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def _initialize_weights(self):
        # Kaiming init for convs; BN starts as identity.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (B, num_classes)."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.fc(x)
# Standard ResNet depths: (block type, blocks per stage).
def resnet18(num_classes=1000):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)


def resnet34(num_classes=1000):
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)


def resnet50(num_classes=1000):
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)


def resnet101(num_classes=1000):
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)


def resnet152(num_classes=1000):
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)


# Smoke test.
for name, model_fn in [('ResNet-18', resnet18), ('ResNet-50', resnet50)]:
    model = model_fn()
    x = torch.randn(2, 3, 224, 224)
    out = model(x)
    params = sum(p.numel() for p in model.parameters())
    print(f"{name}: 출력={out.shape}, 파라미터={params:,}")
DenseNet (2017, Huang) - 조밀한 연결
DenseNet은 각 레이어를 이전의 모든 레이어와 연결합니다. L개의 레이어가 있을 때 ResNet은 L개의 연결이지만 DenseNet은 L(L+1)/2개의 연결이 생깁니다.
import torch
import torch.nn as nn
import torch.nn.functional as F
class DenseLayer(nn.Module):
    """One DenseNet layer: BN-ReLU-1x1 bottleneck, BN-ReLU-3x3, growth_rate outputs.

    Accepts either a tensor or a list of feature tensors (from earlier layers),
    which are concatenated on the channel dimension before processing.
    """

    def __init__(self, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super().__init__()
        # Bottleneck: cap the 3x3's input at bn_size * growth_rate channels.
        self.norm1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, bn_size * growth_rate,
                               kernel_size=1, bias=False)
        # 3x3 conv producing exactly growth_rate new channels.
        self.norm2 = nn.BatchNorm2d(bn_size * growth_rate)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(bn_size * growth_rate, growth_rate,
                               kernel_size=3, padding=1, bias=False)
        self.drop_rate = drop_rate

    def forward(self, x):
        """Return growth_rate new feature channels."""
        # Normalize input to a list of prior feature maps, then concat them all.
        prev_features = [x] if isinstance(x, torch.Tensor) else x
        out = torch.cat(prev_features, dim=1)
        out = self.conv1(self.relu1(self.norm1(out)))
        out = self.conv2(self.relu2(self.norm2(out)))
        if self.drop_rate > 0:
            out = F.dropout(out, p=self.drop_rate, training=self.training)
        return out
class DenseBlock(nn.Module):
    """A stack of DenseLayers where each layer sees every earlier feature map.

    Output channels = in_channels + num_layers * growth_rate.
    """

    def __init__(self, num_layers, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super().__init__()
        self.layers = nn.ModuleList(
            # Layer i consumes the input plus i earlier growth_rate-sized outputs.
            DenseLayer(in_channels + i * growth_rate, growth_rate, bn_size, drop_rate)
            for i in range(num_layers)
        )

    def forward(self, x):
        """Accumulate each layer's output and return everything concatenated."""
        features = [x]
        for layer in self.layers:
            features.append(layer(features))
        return torch.cat(features, dim=1)
class TransitionLayer(nn.Module):
    """Between dense blocks: BN-ReLU-1x1 to shrink channels, then 2x2 avg pool."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Halve the spatial size and change the channel count."""
        return self.pool(self.conv(self.relu(self.norm(x))))
class DenseNet121(nn.Module):
    """DenseNet-121: stem, four dense blocks (6/12/24/16 layers) with transitions."""

    def __init__(self, num_classes=1000, growth_rate=32, num_init_features=64):
        super().__init__()
        block_config = [6, 12, 24, 16]  # layers per dense block (DenseNet-121)
        compression = 0.5               # transition layers halve the channel count

        # Stem: 7x7/s2 conv + 3x3/s2 max pool.
        self.features = nn.Sequential(
            nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # Dense blocks separated by transition layers.
        num_features = num_init_features
        for idx, num_layers in enumerate(block_config):
            self.features.add_module(
                f'denseblock{idx + 1}',
                DenseBlock(num_layers, num_features, growth_rate),
            )
            num_features += num_layers * growth_rate
            if idx != len(block_config) - 1:  # no transition after the last block
                compressed = int(num_features * compression)
                self.features.add_module(
                    f'transition{idx + 1}',
                    TransitionLayer(num_features, compressed),
                )
                num_features = compressed

        # Final BN-ReLU before pooling.
        self.features.add_module('norm_final', nn.BatchNorm2d(num_features))
        self.features.add_module('relu_final', nn.ReLU(inplace=True))

        # Classification head.
        self.classifier = nn.Linear(num_features, num_classes)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return logits of shape (B, num_classes)."""
        out = self.avgpool(self.features(x))
        out = out.flatten(1)
        return self.classifier(out)
# Smoke test.
model = DenseNet121(num_classes=1000)
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"DenseNet-121 출력: {out.shape}, 파라미터: {params:,}")
MobileNet (2017) - 경량화 혁명
MobileNet은 Depthwise Separable Convolution을 도입하여 모바일/엣지 디바이스에서 실행할 수 있는 경량 CNN을 구현했습니다.
import torch
import torch.nn as nn
class DepthwiseSeparableConv(nn.Module):
    """MobileNetV1 building block: per-channel 3x3 conv, then a 1x1 channel mixer."""

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # Depthwise: groups == in_channels, so each channel is filtered alone.
        self.depthwise = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=3,
                      stride=stride, padding=1, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU6(inplace=True),
        )
        # Pointwise: 1x1 conv mixes information across channels.
        self.pointwise = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU6(inplace=True),
        )

    def forward(self, x):
        """Return the pointwise-mixed result of the depthwise filtering."""
        return self.pointwise(self.depthwise(x))
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted residual: expand 1x1, depthwise 3x3, project 1x1.

    The shortcut is used only when stride == 1 and channels are unchanged.
    The projection has no activation (linear bottleneck).
    """

    def __init__(self, in_channels, out_channels, stride, expand_ratio):
        super().__init__()
        self.stride = stride
        hidden_dim = int(in_channels * expand_ratio)
        self.use_res_connect = (stride == 1 and in_channels == out_channels)

        layers = []
        if expand_ratio != 1:
            # 1x1 expansion.
            layers += [
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        layers += [
            # Depthwise 3x3.
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride=stride,
                      padding=1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Linear projection back down.
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the shortcut when shapes allow."""
        out = self.conv(x)
        return x + out if self.use_res_connect else out
class MobileNetV2(nn.Module):
    """MobileNetV2: a stack of inverted residual blocks with a width multiplier."""

    def __init__(self, num_classes=1000, width_mult=1.0):
        super().__init__()
        # Per stage: t = expand ratio, c = output channels, n = repeats, s = first stride.
        inverted_residual_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]
        input_channel = int(32 * width_mult)
        # The final 1x1 layer never narrows below 1280 channels.
        last_channel = int(1280 * max(1.0, width_mult))

        # Stem: 3x3/s2 conv.
        features = [
            nn.Sequential(
                nn.Conv2d(3, input_channel, 3, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(input_channel),
                nn.ReLU6(inplace=True),
            )
        ]
        # Inverted residual stages; only the first block of a stage strides.
        for t, c, n, s in inverted_residual_settings:
            output_channel = int(c * width_mult)
            for rep in range(n):
                features.append(InvertedResidual(
                    input_channel, output_channel,
                    stride=s if rep == 0 else 1,
                    expand_ratio=t,
                ))
                input_channel = output_channel
        # Final 1x1 expansion before pooling.
        features.append(nn.Sequential(
            nn.Conv2d(input_channel, last_channel, 1, bias=False),
            nn.BatchNorm2d(last_channel),
            nn.ReLU6(inplace=True),
        ))
        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(last_channel, num_classes),
        )

    def forward(self, x):
        """Return logits of shape (B, num_classes)."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.classifier(x)
# Parameter comparison for a 512 -> 512 3x3 layer.
standard_conv_params = 3 * 3 * 512 * 512            # standard convolution
dw_sep_params = (3 * 3 * 512) + (512 * 512)         # depthwise + pointwise
print(f"표준 합성곱: {standard_conv_params:,}")
print(f"Depthwise Separable: {dw_sep_params:,}")
print(f"절감 비율: {(1 - dw_sep_params/standard_conv_params):.1%}")
EfficientNet (2019, Tan) - 복합 스케일링
EfficientNet은 폭(Width), 깊이(Depth), 해상도(Resolution)를 복합적으로 스케일링하는 방법을 제안했습니다.
import torch
import torch.nn as nn
import math
class MBConvBlock(nn.Module):
    """EfficientNet MBConv block: expand -> depthwise -> SE -> project.

    A MobileNetV2-style inverted residual with a Squeeze-and-Excitation
    stage. The residual shortcut applies only when stride == 1 and the
    channel count is unchanged.

    Fix vs. the original version: `layers[:len(layers)]` is a full copy —
    it never excluded the `nn.AdaptiveAvgPool2d(1)` that had been appended
    for SE, so `forward` had to iterate `self.conv` manually and break at
    the pooling layer. The pool is no longer added to the list, `self.conv`
    holds exactly the expand+depthwise layers, and `forward` calls it
    directly. Parameter layout, state-dict keys, and numerics are unchanged.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        kernel_size: depthwise kernel size (odd, padded to keep size).
        stride: depthwise stride.
        expand_ratio: channel expansion factor (1 skips the expansion conv).
        se_ratio: SE bottleneck size as a fraction of *in_channels*.
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride, expand_ratio, se_ratio=0.25):
        super().__init__()
        self.stride = stride
        self.use_res = (stride == 1 and in_channels == out_channels)
        hidden_dim = in_channels * expand_ratio

        layers = []
        if expand_ratio != 1:
            # 1x1 expansion conv.
            layers += [
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim, momentum=0.01, eps=1e-3),
                nn.SiLU(),
            ]
        # Depthwise conv (groups == channels).
        layers += [
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride=stride,
                      padding=kernel_size // 2, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim, momentum=0.01, eps=1e-3),
            nn.SiLU(),
        ]
        self.conv = nn.Sequential(*layers)  # expand + depthwise only

        # Squeeze-and-Excitation: bottleneck sized from the block's input.
        se_channels = max(1, int(in_channels * se_ratio))
        self.se_reduce = nn.Conv2d(hidden_dim, se_channels, 1)
        self.se_expand = nn.Conv2d(se_channels, hidden_dim, 1)
        self.se_act = nn.SiLU()

        # 1x1 projection back to out_channels; no activation (linear bottleneck).
        self.project = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels, momentum=0.01, eps=1e-3),
        )
        self._hidden_dim = hidden_dim

    def forward(self, x):
        """Return the block output; adds the shortcut when shapes allow."""
        identity = x
        out = self.conv(x)  # expand + depthwise
        # SE: global-average squeeze, two 1x1 convs, sigmoid channel gate.
        se = out.mean([2, 3], keepdim=True)
        se = self.se_act(self.se_reduce(se))
        se = torch.sigmoid(self.se_expand(se))
        out = out * se
        out = self.project(out)
        if self.use_res:
            out = out + identity
        return out
# EfficientNet compound-scaling table:
# (width_coeff, depth_coeff, input resolution, dropout_rate) per variant.
efficientnet_params = {
    'b0': (1.0, 1.0, 224, 0.2),
    'b1': (1.0, 1.1, 240, 0.2),
    'b2': (1.1, 1.2, 260, 0.3),
    'b3': (1.2, 1.4, 300, 0.3),
    'b4': (1.4, 1.8, 380, 0.4),
    'b5': (1.6, 2.2, 456, 0.4),
    'b6': (1.8, 2.6, 528, 0.5),
    'b7': (2.0, 3.1, 600, 0.5),
}

print("EfficientNet 스케일링 파라미터:")
for version, (w, d, r, drop) in efficientnet_params.items():
    print(f" B{version[1]}: 폭={w:.1f}, 깊이={d:.1f}, 해상도={r}, 드롭아웃={drop}")
ConvNeXt (2022, Liu) - Modern ConvNet
ConvNeXt는 ViT의 디자인 원칙을 CNN에 적용하여 Transformer와 대등한 성능을 달성한 "modernized" ConvNet입니다.
import torch
import torch.nn as nn
class ConvNeXtBlock(nn.Module):
    """ConvNeXt block: 7x7 depthwise conv, LayerNorm, inverted MLP, layer scale."""

    def __init__(self, dim, layer_scale_init_value=1e-6):
        super().__init__()
        # Large-kernel depthwise conv (groups == dim).
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        # LayerNorm applied in channels-last layout.
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        # Inverted bottleneck MLP (4x expansion), implemented as Linear layers.
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        # Learnable per-channel residual scaling (disabled when init value <= 0).
        if layer_scale_init_value > 0:
            self.gamma = nn.Parameter(
                layer_scale_init_value * torch.ones(dim), requires_grad=True)
        else:
            self.gamma = None

    def forward(self, x):
        """Return x + scaled block output; input/output are (N, C, H, W)."""
        shortcut = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # to channels-last for LayerNorm / Linear
        x = self.norm(x)
        x = self.pwconv2(self.act(self.pwconv1(x)))
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # back to channels-first
        return shortcut + x
class ConvNeXt(nn.Module):
    """ConvNeXt (Liu et al., 2022): patchify stem, four block stages, linear head.

    Fix vs. the original version: the constructor built an unused local
    `stem` via a confusing `... if False else ...` expression and registered
    an empty, never-used `self.downsample_layers` ModuleList. Both were dead
    code and are removed; registered parameters, state-dict keys, and
    forward behavior are unchanged.
    """

    def __init__(self, in_channels=3, num_classes=1000,
                 depths=[3, 3, 9, 3], dims=[96, 192, 384, 768]):
        super().__init__()
        # Stem: non-overlapping 4x4 "patchify" conv.
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, dims[0], kernel_size=4, stride=4),
        )

        # Four stages; stages 2-4 are preceded by a norm + 2x2/s2 downsample.
        self.stages = nn.ModuleList()
        self.downsamples = nn.ModuleList()
        for i in range(4):
            if i > 0:
                self.downsamples.append(nn.Sequential(
                    nn.GroupNorm(1, dims[i - 1]),  # per-channel norm before downsampling
                    nn.Conv2d(dims[i - 1], dims[i], kernel_size=2, stride=2),
                ))
            else:
                # Stage 1 follows the stem directly (no downsample).
                self.downsamples.append(nn.Identity())
            self.stages.append(nn.Sequential(
                *[ConvNeXtBlock(dims[i]) for _ in range(depths[i])]
            ))

        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)
        self.head = nn.Linear(dims[-1], num_classes)

    def forward(self, x):
        """Return logits of shape (B, num_classes)."""
        x = self.stem(x)
        for i, (downsample, stage) in enumerate(zip(self.downsamples, self.stages)):
            if i > 0:
                x = downsample(x)
            x = stage(x)
        x = x.mean([-2, -1])  # global average pooling over H, W
        x = self.norm(x)
        return self.head(x)
# ConvNeXt-T configuration: depths=[3,3,9,3], dims=[96,192,384,768].
model = ConvNeXt(num_classes=1000, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768])
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"ConvNeXt-T 출력: {out.shape}, 파라미터: {params:,}")
3. Vision Transformer (ViT)
ViT는 이미지를 패치로 분할하고 Transformer를 적용하는 획기적인 접근법입니다.
import torch
import torch.nn as nn
import math
class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and embed each one.

    Implemented as a single conv whose kernel and stride equal the patch
    size, so each output position is one patch's embedding.
    """

    def __init__(self, image_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        self.projection = nn.Conv2d(
            in_channels, embed_dim,
            kernel_size=patch_size, stride=patch_size,
        )

    def forward(self, x):
        """Map (B, C, H, W) to a patch sequence (B, num_patches, embed_dim)."""
        x = self.projection(x)   # (B, embed_dim, H/patch, W/patch)
        x = x.flatten(2)         # (B, embed_dim, num_patches)
        return x.transpose(1, 2) # (B, num_patches, embed_dim)
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection.

    embed_dim must be divisible by num_heads.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5  # 1/sqrt(d_k)
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)  # Q, K, V in one matmul
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply self-attention to a (B, N, C) sequence; output shape matches."""
        B, N, C = x.shape
        # Split the fused projection into per-head Q, K, V: each (B, heads, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
        # Scaled dot-product attention.
        scores = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.dropout(scores.softmax(dim=-1))
        # Weighted sum, then merge heads back into the embedding dim.
        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj(out)
class TransformerBlock(nn.Module):
    """Pre-norm Transformer encoder block: MHSA and an MLP, each with a residual."""

    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        hidden = int(embed_dim * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Return the block output for a (B, N, C) sequence."""
        x = x + self.attn(self.norm1(x))  # attention sub-layer + residual
        x = x + self.mlp(self.norm2(x))   # MLP sub-layer + residual
        return x
class VisionTransformer(nn.Module):
    """ViT (Dosovitskiy et al.): patch embedding, CLS token, Transformer encoder."""

    def __init__(self, image_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12,
                 mlp_ratio=4.0, dropout=0.0):
        super().__init__()
        # Patch embedding.
        self.patch_embed = PatchEmbedding(image_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learnable CLS token plus learned positional embeddings (patches + CLS).
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(
            torch.zeros(1, num_patches + 1, embed_dim))
        self.pos_dropout = nn.Dropout(dropout)

        # Encoder: a stack of pre-norm Transformer blocks.
        self.blocks = nn.Sequential(*[
            TransformerBlock(embed_dim, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)
        self._init_weights()

    def _init_weights(self):
        # Truncated-normal init for embeddings and Linear weights, zero biases.
        nn.init.trunc_normal_(self.pos_embedding, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return logits of shape (B, num_classes) for a (B, C, H, W) image batch."""
        batch = x.shape[0]
        tokens = self.patch_embed(x)                       # (B, num_patches, D)
        cls = self.cls_token.expand(batch, -1, -1)         # one CLS token per sample
        tokens = torch.cat([cls, tokens], dim=1)           # (B, num_patches+1, D)
        tokens = self.pos_dropout(tokens + self.pos_embedding)
        tokens = self.norm(self.blocks(tokens))
        # Classify from the CLS token only.
        return self.head(tokens[:, 0])
# Standard ViT variants (patch 16, 224x224 input).
def vit_small(num_classes=1000):
    return VisionTransformer(
        image_size=224, patch_size=16, embed_dim=384, depth=12,
        num_heads=6, num_classes=num_classes,
    )


def vit_base(num_classes=1000):
    return VisionTransformer(
        image_size=224, patch_size=16, embed_dim=768, depth=12,
        num_heads=12, num_classes=num_classes,
    )


def vit_large(num_classes=1000):
    return VisionTransformer(
        image_size=224, patch_size=16, embed_dim=1024, depth=24,
        num_heads=16, num_classes=num_classes,
    )
# Smoke test.
model = vit_base()
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"ViT-Base 출력: {out.shape}, 파라미터: {params:,}")
4. Object Detection 아키텍처
YOLO 계열 간단 구현
import torch
import torch.nn as nn
class YOLOHead(nn.Module):
    """Simplified YOLO detection head.

    Predicts (x, y, w, h, objectness, class scores) for each anchor at every
    grid cell; output shape is (B, num_anchors, H, W, 5 + num_classes).
    """

    def __init__(self, in_channels, num_anchors, num_classes):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        predictions_per_anchor = 5 + num_classes  # box (4) + objectness + classes
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, in_channels * 2, kernel_size=3, padding=1),
            nn.BatchNorm2d(in_channels * 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(in_channels * 2, num_anchors * predictions_per_anchor,
                      kernel_size=1),
        )

    def forward(self, x):
        """Return predictions reshaped to (B, anchors, H, W, 5 + classes)."""
        raw = self.head(x)
        B, _, H, W = raw.shape
        raw = raw.reshape(B, self.num_anchors, 5 + self.num_classes, H, W)
        return raw.permute(0, 1, 3, 4, 2).contiguous()
# Minimal YOLOv1-style detector: a backbone followed by a fully connected
# head that predicts a 7x7 grid of (num_boxes * 5 + num_classes) values.
class SimpleYOLO(nn.Module):
    def __init__(self, backbone, num_classes=80, num_boxes=2):
        super(SimpleYOLO, self).__init__()
        self.backbone = backbone
        self.num_classes = num_classes
        self.num_boxes = num_boxes
        # Each grid cell predicts num_boxes boxes (5 values each) plus class scores.
        cell_dim = num_boxes * 5 + num_classes
        # Prediction head: pool features to a fixed 7x7 grid, then two FC layers.
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d((7, 7)),
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 7 * 7 * cell_dim),
        )

    def forward(self, x):
        """Return per-cell predictions shaped (B, 7, 7, num_boxes*5 + num_classes)."""
        features = self.backbone(x)
        preds = self.head(features)
        return preds.reshape(-1, 7, 7, self.num_boxes * 5 + self.num_classes)
5. Image Segmentation: U-Net
import torch
import torch.nn as nn
import torch.nn.functional as F
class DoubleConv(nn.Module):
    """(Conv3x3 -> BatchNorm -> ReLU) applied twice, as used throughout U-Net."""

    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        layers = []
        channels = in_channels
        # Two identical conv stages; only the first changes the channel count.
        for _ in range(2):
            layers.extend([
                nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ])
            channels = out_channels
        self.double_conv = nn.Sequential(*layers)

    def forward(self, x):
        return self.double_conv(x)
class UNet(nn.Module):
    """U-Net encoder-decoder for semantic segmentation (Ronneberger et al., 2015).

    Args:
        in_channels: channels of the input image (1 for grayscale medical scans).
        num_classes: number of output segmentation classes.
        features: channel widths of the encoder stages, shallow to deep.
            (Default is a tuple, not a list: mutable default arguments are a
            well-known Python pitfall.)
    """

    def __init__(self, in_channels=1, num_classes=2, features=(64, 128, 256, 512)):
        super(UNet, self).__init__()
        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.pool = nn.MaxPool2d(2, 2)
        # Encoder: DoubleConv stages of increasing width, each followed by pooling.
        for feature in features:
            self.encoders.append(DoubleConv(in_channels, feature))
            in_channels = feature
        # Bottleneck doubles the deepest width.
        self.bottleneck = DoubleConv(features[-1], features[-1] * 2)
        # Decoder: (ConvTranspose upsample, DoubleConv) pairs, deep to shallow.
        # DoubleConv input is 2*feature because the skip connection is concatenated.
        for feature in reversed(features):
            self.decoders.append(
                nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2)
            )
            self.decoders.append(DoubleConv(feature * 2, feature))
        # 1x1 conv maps the shallowest width to per-class logits.
        self.final_conv = nn.Conv2d(features[0], num_classes, kernel_size=1)

    def forward(self, x):
        """Return per-pixel class logits of shape (B, num_classes, H, W)."""
        skip_connections = []
        # Encoder: remember each stage's output for the skip connections.
        for encoder in self.encoders:
            x = encoder(x)
            skip_connections.append(x)
            x = self.pool(x)
        # Bottleneck
        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]  # deepest skip first
        # Decoder: upsample, align, concatenate skip, refine.
        for i in range(0, len(self.decoders), 2):
            x = self.decoders[i](x)  # ConvTranspose upsample
            skip = skip_connections[i // 2]
            # Odd input sizes make the upsampled map one pixel smaller than
            # the skip; resize so concatenation is possible.
            if x.shape != skip.shape:
                x = F.interpolate(x, size=skip.shape[2:])
            x = torch.cat([skip, x], dim=1)  # skip connection
            x = self.decoders[i + 1](x)  # DoubleConv
        return self.final_conv(x)
# Smoke test: run U-Net on a batch of 572x572 grayscale images
# (the input size used in the original paper).
model = UNet(in_channels=1, num_classes=2)
x = torch.randn(4, 1, 572, 572)
out = model(x)
print(f"U-Net 출력: {out.shape}")  # (4, 2, 572, 572)
params = sum(p.numel() for p in model.parameters())
print(f"U-Net 파라미터: {params:,}")
6. 전이학습 실전 가이드
torchvision.models 활용
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import torch.optim as optim
from tqdm import tqdm
# Load ImageNet-pretrained backbones from torchvision's model zoo.
# The Weights enums pin an exact checkpoint (and its preprocessing recipe).
model_resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
model_efficientnet = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)
model_vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
resnet_params = sum(p.numel() for p in model_resnet.parameters())
print(f"ResNet-50 파라미터: {resnet_params:,}")
def feature_extraction(num_classes, freeze=True):
    """Transfer learning via feature extraction.

    Loads an ImageNet-pretrained ResNet-50, optionally freezes the backbone,
    and replaces the classifier with a small MLP head that is always trainable.

    Args:
        num_classes: number of target classes.
        freeze: when True, backbone weights are frozen so only the new
            classifier head is optimized.

    Returns:
        The modified model.
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    if freeze:
        for p in model.parameters():
            p.requires_grad = False
    # Swap the ImageNet classifier for a small MLP head.
    # (model.fc.in_features is read before the assignment replaces it.)
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(model.fc.in_features, 256),
        nn.ReLU(),
        nn.Linear(256, num_classes),
    )
    # Ensure the head trains even when the backbone was frozen.
    for p in model.fc.parameters():
        p.requires_grad = True
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"학습 가능 파라미터: {trainable:,} / {total:,} ({trainable/total:.1%})")
    return model
def fine_tuning(num_classes, unfreeze_layers=2):
    """Transfer learning via fine-tuning.

    Loads an ImageNet-pretrained ResNet-50, freezes everything, then
    unfreezes the last ``unfreeze_layers`` parameterized stages so they are
    trained together with a fresh classifier.

    Bug fixed vs. the original: the unfreeze list used to include
    ``model.avgpool``, which has no parameters, so ``unfreeze_layers=2``
    effectively unfroze only the classifier and ``layer4`` never trained.
    The list now contains only modules that actually hold parameters.

    Args:
        num_classes: number of target classes.
        unfreeze_layers: how many of the final stages to unfreeze
            (1 = classifier only, 2 = layer4 + classifier, ... up to 5).

    Returns:
        The modified model.
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    # Freeze the whole network first.
    for param in model.parameters():
        param.requires_grad = False
    # Unfreeze the last N stages with trainable parameters.
    stages = [model.layer1, model.layer2, model.layer3, model.layer4, model.fc]
    for stage in stages[-unfreeze_layers:]:
        for param in stage.parameters():
            param.requires_grad = True
    # Replace the classifier; a freshly constructed Linear is trainable by default.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model
# Training loop, decomposed into per-epoch helpers.
def _train_one_epoch(model, loader, criterion, optimizer, device, desc):
    """Run one training epoch; return (mean_loss, accuracy)."""
    model.train()
    total_loss, correct, seen = 0.0, 0, 0
    for images, labels in tqdm(loader, desc=desc):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        # Clip gradients to stabilize fine-tuning of pretrained weights.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item() * images.size(0)
        correct += (outputs.argmax(1) == labels).sum().item()
        seen += images.size(0)
    seen = max(seen, 1)  # guard against an empty loader
    return total_loss / seen, correct / seen


def _evaluate(model, loader, criterion, device):
    """Evaluate without gradients; return (mean_loss, accuracy)."""
    model.eval()
    total_loss, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * images.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()
            seen += images.size(0)
    seen = max(seen, 1)  # guard against an empty loader
    return total_loss / seen, correct / seen


def train_model(model, train_loader, val_loader, epochs=10,
                learning_rate=1e-3, device='cuda'):
    """Train a (possibly pretrained) classifier with discriminative LRs.

    The backbone gets 0.1x the head's learning rate, cosine-annealed over
    ``epochs``. The best checkpoint (by validation accuracy) is saved to
    'best_model.pt'.

    Args:
        model: classifier whose head is named 'fc' (ResNet convention —
            NOTE(review): the parameter grouping below matches on 'fc' in the
            parameter name, so other architectures fall entirely into the
            backbone group; confirm for non-ResNet models.
        train_loader / val_loader: iterables yielding (images, labels).
        epochs: number of epochs.
        learning_rate: head learning rate; backbone uses learning_rate * 0.1.
        device: torch device string.

    Returns:
        (model, history) where history maps 'train_loss'/'val_loss'/
        'train_acc'/'val_acc' to per-epoch lists.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Discriminative learning rates: smaller steps for pretrained backbone.
    backbone_params = [p for n, p in model.named_parameters()
                       if 'fc' not in n and p.requires_grad]
    head_params = [p for n, p in model.named_parameters()
                   if 'fc' in n and p.requires_grad]
    optimizer = optim.AdamW([
        {'params': backbone_params, 'lr': learning_rate * 0.1},
        {'params': head_params, 'lr': learning_rate}
    ], weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    best_val_acc = 0.0
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
    for epoch in range(epochs):
        train_loss, train_acc = _train_one_epoch(
            model, train_loader, criterion, optimizer, device,
            desc=f'Epoch {epoch+1}/{epochs}')
        val_loss, val_acc = _evaluate(model, val_loader, criterion, device)
        scheduler.step()
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        print(f"에폭 {epoch+1}: Train={train_acc:.4f}, Val={val_acc:.4f}")
        # Keep the checkpoint with the best validation accuracy.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
    print(f"최고 검증 정확도: {best_val_acc:.4f}")
    return model, history
# Data augmentation pipelines.
def get_transforms(image_size=224):
    """Build (train, val) torchvision transform pipelines.

    Train: random crop / flip / rotation / color-jitter augmentation.
    Val: deterministic resize + center crop.
    Both normalize with ImageNet channel statistics.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]
    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2,
                               saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])
    val_tf = transforms.Compose([
        # Resize ~1.14x then center-crop: the standard ImageNet eval recipe.
        transforms.Resize(int(image_size * 1.14)),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])
    return train_tf, val_tf
아키텍처 성능 비교
| 모델 | 년도 | Top-1 정확도 | 파라미터 | FLOPs |
|---|---|---|---|---|
| LeNet-5 | 1998 | ~99% (MNIST) | 60K | - |
| AlexNet | 2012 | 56.5% | 61M | 724M |
| VGG-16 | 2014 | 71.6% | 138M | 15.5G |
| GoogLeNet | 2014 | 68.7% | 6.8M | 1.5G |
| ResNet-50 | 2015 | 75.3% | 25M | 4.1G |
| DenseNet-121 | 2017 | 74.4% | 8M | 2.9G |
| MobileNetV2 | 2018 | 71.8% | 3.4M | 300M |
| EfficientNet-B0 | 2019 | 77.1% | 5.3M | 390M |
| ConvNeXt-T | 2022 | 82.1% | 28M | 4.5G |
| ViT-B/16 | 2020 | 81.8% | 86M | 17.6G |
마무리
CNN 아키텍처는 지속적으로 진화하고 있습니다.
- LeNet (1998): 최초의 실용적 CNN, 구조의 근간 확립
- AlexNet (2012): 딥러닝 르네상스, ReLU와 Dropout 도입
- VGGNet (2014): 3x3 컨볼루션의 힘, 깊이의 중요성 입증
- ResNet (2015): 잔차 연결로 기울기 소실 해결, 수백 층 학습 가능
- DenseNet (2017): 조밀한 연결로 특징 재사용 극대화
- MobileNet (2017): 경량화로 모바일 배포 실현
- EfficientNet (2019): 복합 스케일링으로 최고 효율 달성
- ConvNeXt (2022): Transformer 인사이트를 CNN에 적용
- ViT (2020): 이미지도 시퀀스로 처리하는 새로운 패러다임
실전에서는 torchvision의 사전학습 모델에서 시작하여 전이학습으로 빠르게 목표 태스크에 적용하는 것을 권장합니다.
참고 자료
- PyTorch Vision Models
- ResNet 논문: He et al., "Deep Residual Learning for Image Recognition" (arXiv:1512.03385)
- EfficientNet 논문: Tan & Le, "EfficientNet: Rethinking Model Scaling" (arXiv:1905.11946)
- ViT 논문: Dosovitskiy et al., "An Image is Worth 16x16 Words" (arXiv:2010.11929)
- ConvNeXt 논문: Liu et al., "A ConvNet for the 2020s" (arXiv:2201.03545)