Skip to content
Published on

CNN 아키텍처 완전 정복: LeNet부터 EfficientNet, Vision Transformer까지

Authors

CNN 아키텍처 완전 정복

컨볼루션 신경망(CNN, Convolutional Neural Network)은 컴퓨터 비전 혁명의 핵심입니다. 1998년 LeNet의 등장부터 2022년 ConvNeXt와 Vision Transformer까지, CNN 아키텍처는 놀라운 속도로 발전해 왔습니다. 이 가이드에서는 주요 CNN 아키텍처의 구조적 혁신을 이해하고, PyTorch로 직접 구현하는 방법을 완전히 마스터합니다.


1. CNN 기초

합성곱 연산 직관적 이해

합성곱(Convolution)은 이미지의 지역적 패턴을 추출하는 연산입니다. 작은 필터(커널)가 이미지 위를 슬라이딩하며 특징 맵(Feature Map)을 생성합니다.

입력 이미지 (5x5)    커널 (3x3)         출력 특징 맵 (3x3)
1 1 1 0 0           1 0 1              4 3 4
0 1 1 1 0    *      0 1 0    =         2 4 3
0 0 1 1 1           1 0 1              2 3 4
0 0 1 1 0
0 1 1 0 0

각 위치에서 커널과 이미지 패치의 원소별 곱의 합이 출력 특징 맵의 값이 됩니다.

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# 합성곱 연산 시각화
def visualize_convolution():
    # 샘플 이미지
    image = torch.tensor([[
        [1., 1., 1., 0., 0.],
        [0., 1., 1., 1., 0.],
        [0., 0., 1., 1., 1.],
        [0., 0., 1., 1., 0.],
        [0., 1., 1., 0., 0.]
    ]]).unsqueeze(0)  # (1, 1, 5, 5)

    # 엣지 감지 커널
    edge_kernel = torch.tensor([[
        [[-1., -1., -1.],
         [-1.,  8., -1.],
         [-1., -1., -1.]]
    ]])  # (1, 1, 3, 3)

    output = F.conv2d(image, edge_kernel, padding=1)

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].imshow(image[0, 0].numpy(), cmap='gray')
    axes[0].set_title('입력 이미지')
    axes[1].imshow(edge_kernel[0, 0].numpy(), cmap='RdYlBu')
    axes[1].set_title('엣지 감지 커널')
    axes[2].imshow(output[0, 0].detach().numpy(), cmap='gray')
    axes[2].set_title('출력 특징 맵')
    plt.tight_layout()
    plt.show()

visualize_convolution()

커널/필터, 스트라이드, 패딩

import torch
import torch.nn as nn

# Basic Conv2d parameters and the arithmetic behind output sizes.
conv = nn.Conv2d(
    in_channels=3,    # input channels (RGB = 3)
    out_channels=64,  # output channels (number of filters)
    kernel_size=3,    # 3x3 kernel
    stride=1,
    padding=1,        # "same" padding for a 3x3 kernel at stride 1
    bias=True
)

def calc_output_size(input_size, kernel_size, stride, padding):
    """Spatial output size of a conv/pool layer.

    Implements: floor((input + 2*padding - kernel) / stride) + 1
    """
    padded = input_size + 2 * padding
    return (padded - kernel_size) // stride + 1

# Examples
print(calc_output_size(224, 3, 1, 1))   # 224 (same padding)
print(calc_output_size(224, 3, 2, 1))   # 112 (stride 2 halves the size)
print(calc_output_size(224, 7, 2, 3))   # 112 (AlexNet-style first layer)

# Parameter count for a Conv2d:
# (kernel_h * kernel_w * in_channels + 1) * out_channels  (+1 is the bias)
params = (3 * 3 * 3 + 1) * 64
print(f"Conv(3->64, 3x3) 파라미터 수: {params:,}")  # 1,792

Pooling (Max, Average, Global)

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(1, 64, 28, 28)

# Max / average pooling: a 2x2 window with stride 2 halves each spatial dim.
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
out_max = max_pool(x)  # (1, 64, 14, 14)
out_avg = avg_pool(x)  # (1, 64, 14, 14)

# Global Average Pooling (GAP): collapse each channel map to a single value.
gap = nn.AdaptiveAvgPool2d(1)
out_gap = gap(x)                   # (1, 64, 1, 1)
out_gap_flat = out_gap.flatten(1)  # (1, 64)

# Adaptive pooling: specify the desired output size; any input size works.
adaptive = nn.AdaptiveAvgPool2d((7, 7))
out_adaptive = adaptive(x)  # (1, 64, 7, 7)

print(f"입력: {x.shape}")
print(f"MaxPool: {out_max.shape}")
print(f"GAP: {out_gap_flat.shape}")

Receptive Field 계산

def calculate_receptive_field(layers):
    """
    Receptive field (in input pixels) after a stack of layers.
    layers: [(kernel_size, stride, dilation), ...]
    """
    rf = 1    # receptive field accumulated so far
    jump = 1  # input-pixel distance between adjacent output pixels

    for kernel, stride, dilation in layers:
        effective = dilation * (kernel - 1) + 1  # dilated kernel extent
        rf += (effective - 1) * jump
        jump *= stride

    return rf

# VGG-style stack (3x3 convs only)
vgg_layers = [
    (3, 1, 1),  # conv1
    (3, 1, 1),  # conv2
    (2, 2, 1),  # pool
    (3, 1, 1),  # conv3
    (3, 1, 1),  # conv4
    (2, 2, 1),  # pool
]

rf = calculate_receptive_field(vgg_layers)
print(f"VGG 6개 레이어 후 수용 영역: {rf}x{rf} 픽셀")

# Note: two stacked 3x3 convs cover the same receptive field as one 5x5,
# but cost 2*(9*C^2) parameters vs 25*C^2 -> the 3x3 pair is cheaper.

2. CNN 발전사

LeNet-5 (1998, LeCun) - 최초의 실용적 CNN

LeNet-5는 Yann LeCun이 1998년 개발한 최초의 실용적 CNN으로, 손으로 쓴 숫자 인식(MNIST)에 사용되었습니다.

구조: Input(32x32) → C1(conv, 6@28x28) → S2(pool, 6@14x14) → C3(conv, 16@10x10) → S4(pool, 16@5x5) → C5(conv, 120@1x1) → F6(fc, 84) → Output(10)

import torch
import torch.nn as nn

class LeNet5(nn.Module):
    """LeNet-5 (LeCun, 1998) for 32x32 single-channel input.

    NOTE(review): an earlier docstring claimed ReLU was added, but this
    implementation keeps the classic Tanh activations throughout.
    """

    def __init__(self, num_classes=10):
        super(LeNet5, self).__init__()

        self.features = nn.Sequential(
            # C1: 1@32x32 -> 6@28x28
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.Tanh(),
            # S2: 6@28x28 -> 6@14x14 (subsampling via average pooling)
            nn.AvgPool2d(kernel_size=2, stride=2),

            # C3: 6@14x14 -> 16@10x10
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.Tanh(),
            # S4: 16@10x10 -> 16@5x5
            nn.AvgPool2d(kernel_size=2, stride=2),

            # C5: 16@5x5 -> 120@1x1 (5x5 conv over the full map: acts like an FC layer)
            nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0),
            nn.Tanh(),
        )

        self.classifier = nn.Sequential(
            # F6: 120 -> 84
            nn.Linear(120, 84),
            nn.Tanh(),
            # Output: 84 -> num_classes
            nn.Linear(84, num_classes)
        )

    def forward(self, x):
        # x: (N, 1, 32, 32) expected
        x = self.features(x)
        x = x.flatten(1)  # (N, 120)
        x = self.classifier(x)
        return x


# Smoke test
model = LeNet5(num_classes=10)
x = torch.randn(4, 1, 32, 32)
out = model(x)
print(f"LeNet-5 출력: {out.shape}")  # (4, 10)

# Parameter count
total_params = sum(p.numel() for p in model.parameters())
print(f"LeNet-5 총 파라미터: {total_params:,}")  # ~60,000

AlexNet (2012, Krizhevsky) - 딥러닝 르네상스의 시작

AlexNet은 2012년 ImageNet 대회에서 top-5 오류율 15.3%를 달성하며 기존 최고 성능(26.2%)을 크게 앞서 딥러닝 시대를 열었습니다.

핵심 혁신:

  • ReLU 활성화 함수 도입 (Tanh 대비 6배 빠른 학습)
  • Dropout (0.5)으로 과적합 방지
  • Data Augmentation (크롭, 반전)
  • Local Response Normalization (LRN)
  • GPU 2개 병렬 학습
import torch
import torch.nn as nn

class AlexNet(nn.Module):
    """AlexNet (Krizhevsky et al., 2012) for 224x224 RGB input.

    Keeps the paper's key ingredients: ReLU activations, Local Response
    Normalization after the first two conv layers, overlapping 3x3/stride-2
    max pooling, and 0.5 dropout in the 4096-wide classifier.
    """

    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()

        self.features = nn.Sequential(
            # Layer 1: 3@224x224 -> 96@55x55
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 96@27x27

            # Layer 2: 96@27x27 -> 256@27x27
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=1e-4, beta=0.75, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 256@13x13

            # Layer 3: 256@13x13 -> 384@13x13
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),

            # Layer 4: 384@13x13 -> 384@13x13
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),

            # Layer 5: 384@13x13 -> 256@13x13
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),  # 256@6x6
        )

        # Makes the classifier input size independent of the exact input resolution
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),  # heavy dropout against overfitting, per the paper
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        # x: (N, 3, 224, 224) expected
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)  # (N, 256*6*6)
        x = self.classifier(x)
        return x


# Smoke test
model = AlexNet(num_classes=1000)
x = torch.randn(4, 3, 224, 224)
out = model(x)
print(f"AlexNet 출력: {out.shape}")  # (4, 1000)

total_params = sum(p.numel() for p in model.parameters())
print(f"AlexNet 총 파라미터: {total_params:,}")  # ~61M

VGGNet (2014, Simonyan) - 깊이의 힘

VGGNet은 Oxford의 Visual Geometry Group이 개발했습니다. 핵심 인사이트는 모든 합성곱 레이어에 3x3 커널만 사용하여 깊이를 극적으로 늘린 것입니다.

왜 3x3인가?

  • 3x3 두 개 = 5x5 하나의 수용 영역 (파라미터 수는 2×9C² vs 25C², 28% 절약)
  • 3x3 세 개 = 7x7 하나의 수용 영역 (파라미터 수는 3×9C² vs 49C², 45% 절약)
  • 더 많은 비선형 변환으로 표현력 향상
import torch
import torch.nn as nn
from typing import List, Union

class VGG(nn.Module):
    """Generic VGG: a configurable conv feature extractor followed by the
    standard 4096-4096-num_classes fully connected classifier.

    Args:
        features: conv/pool stack (e.g. built by `make_layers`); must end
            with 512 channels so the classifier input is 512*7*7.
        num_classes: size of the output layer.
        dropout: dropout probability used in the classifier.
    """

    def __init__(self, features: nn.Module, num_classes: int = 1000, dropout: float = 0.5):
        super(VGG, self).__init__()
        self.features = features
        # Fixes the classifier input at 512*7*7 regardless of input resolution
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, num_classes)
        )
        self._initialize_weights()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        # Kaiming init for convs (ReLU networks), identity-like BN,
        # small-normal init for the linear layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)


def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False) -> nn.Sequential:
    """Build a VGG feature extractor from a config list.

    'M' appends a 2x2 max-pool; an integer appends a 3x3 conv with that
    many output channels (optionally followed by BatchNorm) plus ReLU.
    """
    modules: List[nn.Module] = []
    channels = 3  # RGB input
    for item in cfg:
        if item == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        width = int(item)
        modules.append(nn.Conv2d(channels, width, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(width))
        modules.append(nn.ReLU(inplace=True))
        channels = width
    return nn.Sequential(*modules)


# VGG configurations: integers are conv output channels, 'M' marks a 2x2 max-pool
cfgs = {
    'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

def vgg16(num_classes=1000):
    # NOTE(review): batch_norm=True builds the VGG16-BN variant, not the original paper's VGG16
    return VGG(make_layers(cfgs['vgg16'], batch_norm=True), num_classes=num_classes)

def vgg19(num_classes=1000):
    # NOTE(review): batch-norm variant (VGG19-BN)
    return VGG(make_layers(cfgs['vgg19'], batch_norm=True), num_classes=num_classes)


# Smoke test
model_vgg16 = vgg16()
x = torch.randn(2, 3, 224, 224)
out = model_vgg16(x)
print(f"VGG-16 출력: {out.shape}")

params_vgg16 = sum(p.numel() for p in model_vgg16.parameters())
print(f"VGG-16 파라미터: {params_vgg16:,}")  # ~138M

GoogLeNet/Inception (2014, Szegedy) - 병렬 다중 스케일 처리

Inception 모듈의 핵심은 동일한 레이어에서 서로 다른 크기의 커널(1x1, 3x3, 5x5)을 병렬로 처리하여 다양한 스케일의 특징을 동시에 추출하는 것입니다.

import torch
import torch.nn as nn
import torch.nn.functional as F

class InceptionModule(nn.Module):
    """Inception block: four parallel branches concatenated along channels.

    Output channels = n1x1 + n3x3 + n5x5 + pool_proj.
    """

    def __init__(self, in_channels, n1x1, n3x3_reduce, n3x3,
                 n5x5_reduce, n5x5, pool_proj):
        super(InceptionModule, self).__init__()

        # Branch 1: plain 1x1 convolution
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, n1x1, kernel_size=1),
            nn.BatchNorm2d(n1x1),
            nn.ReLU(inplace=True),
        )

        # Branch 2: 1x1 bottleneck, then 3x3
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, n3x3_reduce, kernel_size=1),
            nn.BatchNorm2d(n3x3_reduce),
            nn.ReLU(inplace=True),
            nn.Conv2d(n3x3_reduce, n3x3, kernel_size=3, padding=1),
            nn.BatchNorm2d(n3x3),
            nn.ReLU(inplace=True),
        )

        # Branch 3: 1x1 bottleneck, then 5x5
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, n5x5_reduce, kernel_size=1),
            nn.BatchNorm2d(n5x5_reduce),
            nn.ReLU(inplace=True),
            nn.Conv2d(n5x5_reduce, n5x5, kernel_size=5, padding=2),
            nn.BatchNorm2d(n5x5),
            nn.ReLU(inplace=True),
        )

        # Branch 4: 3x3 max-pool followed by a 1x1 projection
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, pool_proj, kernel_size=1),
            nn.BatchNorm2d(pool_proj),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        # All branches preserve H and W, so their outputs can be
        # concatenated along the channel dimension.
        branches = [self.branch1(x), self.branch2(x),
                    self.branch3(x), self.branch4(x)]
        return torch.cat(branches, dim=1)


# Smoke test
module = InceptionModule(192, 64, 96, 128, 16, 32, 32)
x = torch.randn(2, 192, 28, 28)
out = module(x)
print(f"Inception 출력: {out.shape}")  # (2, 256, 28, 28) = 64+128+32+32

ResNet (2015, He) - 잔차 연결로 기울기 소실 해결

ResNet은 He Kaiming이 2015년 발표한 혁신적 아키텍처입니다. 잔차 연결(Skip Connection)을 통해 기울기가 깊은 레이어까지 전달되어 152층의 매우 깊은 네트워크를 학습할 수 있게 되었습니다.

핵심 아이디어: H(x) = F(x) + x

레이어가 H(x)를 직접 학습하는 대신 잔차 F(x) = H(x) - x를 학습합니다. 최적 함수가 항등 함수에 가까울 때, 잔차를 0으로 만드는 것이 더 쉽습니다.

import torch
import torch.nn as nn
from typing import Optional, Type, List

class BasicBlock(nn.Module):
    """Two 3x3 convs with a skip connection (ResNet-18/34 building block)."""

    # Output channels = out_channels * expansion
    expansion = 1

    def __init__(self, in_channels: int, out_channels: int,
                 stride: int = 1, downsample: Optional[nn.Module] = None):
        super(BasicBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Projection for the identity path when shape/channels change
        self.downsample = downsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Identity path (projected if necessary)
        shortcut = x if self.downsample is None else self.downsample(x)

        # Residual path: conv-bn-relu, conv-bn
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut  # the residual addition H(x) = F(x) + x
        return self.relu(out)


class Bottleneck(nn.Module):
    """1x1 reduce -> 3x3 -> 1x1 expand bottleneck (ResNet-50/101/152)."""

    # The final 1x1 conv widens channels by this factor
    expansion = 4

    def __init__(self, in_channels: int, out_channels: int,
                 stride: int = 1, downsample: Optional[nn.Module] = None):
        super(Bottleneck, self).__init__()

        # 1x1: shrink channels before the expensive 3x3
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)

        # 3x3: spatial processing (stride applied here)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # 1x1: expand channels to out_channels * expansion
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))  # no ReLU before the addition

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """Full ResNet: 7x7 stem, four residual stages, GAP, and a linear head.

    Args:
        block: BasicBlock (ResNet-18/34) or Bottleneck (ResNet-50+).
        layers: number of blocks in each of the four stages.
        num_classes: output dimension of the final linear layer.
    """

    def __init__(self, block: Type[nn.Module], layers: List[int],
                 num_classes: int = 1000):
        super(ResNet, self).__init__()
        # Channel count entering the next stage; updated by _make_layer
        self.in_channels = 64

        # Stem: 7x7/2 conv + 3x3/2 max-pool -> 4x spatial reduction
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; stages 2-4 halve the spatial size
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classifier head: GAP then a single linear layer
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        self._initialize_weights()

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage: the first block may downsample, the rest keep shape."""
        downsample = None
        # 1x1 projection when the identity path changes spatial size or channels
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion)
            )

        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels * block.expansion

        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def _initialize_weights(self):
        # Kaiming init for convs; BN starts as identity (weight=1, bias=0)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Stem
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        # Four stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Classify
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.fc(x)

        return x


def resnet18(num_classes=1000):
    """ResNet-18: BasicBlock x [2, 2, 2, 2]."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

def resnet34(num_classes=1000):
    """ResNet-34: BasicBlock x [3, 4, 6, 3]."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)

def resnet50(num_classes=1000):
    """ResNet-50: Bottleneck x [3, 4, 6, 3]."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)

def resnet101(num_classes=1000):
    """ResNet-101: Bottleneck x [3, 4, 23, 3]."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)

def resnet152(num_classes=1000):
    """ResNet-152: Bottleneck x [3, 8, 36, 3]."""
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)


# Smoke test
for name, model_fn in [('ResNet-18', resnet18), ('ResNet-50', resnet50)]:
    model = model_fn()
    x = torch.randn(2, 3, 224, 224)
    out = model(x)
    params = sum(p.numel() for p in model.parameters())
    print(f"{name}: 출력={out.shape}, 파라미터={params:,}")

DenseNet (2017, Huang) - 조밀한 연결

DenseNet은 각 레이어를 이전의 모든 레이어와 연결합니다. L개의 레이어가 있을 때 ResNet은 L개의 연결이지만 DenseNet은 L(L+1)/2개의 연결이 생깁니다.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DenseLayer(nn.Module):
    """Single DenseNet layer: BN-ReLU-1x1 bottleneck, then BN-ReLU-3x3.

    Always emits exactly `growth_rate` new channels, regardless of how
    many feature maps it receives.
    """

    def __init__(self, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super(DenseLayer, self).__init__()
        # 1x1 bottleneck caps the channel count fed to the 3x3 conv
        self.norm1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, bn_size * growth_rate,
                               kernel_size=1, bias=False)

        # 3x3 conv producing the growth_rate new channels
        self.norm2 = nn.BatchNorm2d(bn_size * growth_rate)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(bn_size * growth_rate, growth_rate,
                               kernel_size=3, padding=1, bias=False)

        self.drop_rate = drop_rate

    def forward(self, x):
        # Accept either a tensor or a list of earlier feature maps
        prev = [x] if isinstance(x, torch.Tensor) else x

        # Concatenate everything this layer can see
        combined = torch.cat(prev, dim=1)

        out = self.conv1(self.relu1(self.norm1(combined)))
        out = self.conv2(self.relu2(self.norm2(out)))

        if self.drop_rate > 0:
            out = F.dropout(out, p=self.drop_rate, training=self.training)

        return out


class DenseBlock(nn.Module):
    """Stack of DenseLayers; each layer sees every earlier feature map.

    Output channels = in_channels + num_layers * growth_rate.
    """

    def __init__(self, num_layers, in_channels, growth_rate, bn_size=4, drop_rate=0.0):
        super(DenseBlock, self).__init__()
        # Layer i receives the block input plus i earlier growth_rate-wide outputs
        self.layers = nn.ModuleList(
            DenseLayer(in_channels + idx * growth_rate,
                       growth_rate, bn_size, drop_rate)
            for idx in range(num_layers)
        )

    def forward(self, x):
        feature_maps = [x]
        for dense_layer in self.layers:
            # Each DenseLayer concatenates the list internally
            feature_maps.append(dense_layer(feature_maps))
        return torch.cat(feature_maps, dim=1)


class TransitionLayer(nn.Module):
    """Between dense blocks: BN-ReLU-1x1 conv compresses channels,
    then a 2x2 average pool halves H and W."""

    def __init__(self, in_channels, out_channels):
        super(TransitionLayer, self).__init__()
        self.norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        out = self.norm(x)
        out = self.relu(out)
        out = self.conv(out)
        return self.pool(out)


class DenseNet121(nn.Module):
    """DenseNet-121: four dense blocks (6/12/24/16 layers) with 0.5x channel
    compression in the transition layers between them.

    Args:
        num_classes: output dimension of the classifier.
        growth_rate: channels added by every dense layer (k in the paper).
        num_init_features: channels produced by the stem conv.
    """

    def __init__(self, num_classes=1000, growth_rate=32, num_init_features=64):
        super(DenseNet121, self).__init__()

        # DenseNet-121 layout: layers per dense block
        block_config = [6, 12, 24, 16]
        compression = 0.5  # transition layers keep half the channels

        # Stem: 7x7/2 conv + 3x3/2 max-pool
        self.features = nn.Sequential(
            nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Alternate dense blocks and transition layers
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(num_layers, num_features, growth_rate)
            self.features.add_module(f'denseblock{i+1}', block)
            # Each dense layer adds growth_rate channels
            num_features = num_features + num_layers * growth_rate

            if i != len(block_config) - 1:  # no transition after the last block
                out_features = int(num_features * compression)
                transition = TransitionLayer(num_features, out_features)
                self.features.add_module(f'transition{i+1}', transition)
                num_features = out_features

        # Final BN + ReLU before pooling
        self.features.add_module('norm_final', nn.BatchNorm2d(num_features))
        self.features.add_module('relu_final', nn.ReLU(inplace=True))

        # Classifier head
        self.classifier = nn.Linear(num_features, num_classes)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        features = self.features(x)
        out = self.avgpool(features)
        out = out.flatten(1)
        out = self.classifier(out)
        return out


# Smoke test
model = DenseNet121(num_classes=1000)
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"DenseNet-121 출력: {out.shape}, 파라미터: {params:,}")

MobileNet (2017) - 경량화 혁명

MobileNet은 Depthwise Separable Convolution을 도입하여 모바일/엣지 디바이스에서 실행할 수 있는 경량 CNN을 구현했습니다.

import torch
import torch.nn as nn

class DepthwiseSeparableConv(nn.Module):
    """Factorized convolution: per-channel 3x3 (depthwise) followed by a
    1x1 channel-mixing conv (pointwise), each with BN + ReLU6."""

    def __init__(self, in_channels, out_channels, stride=1):
        super(DepthwiseSeparableConv, self).__init__()

        # Depthwise: groups == in_channels, so each channel is filtered independently
        self.depthwise = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=3,
                      stride=stride, padding=1, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU6(inplace=True)
        )

        # Pointwise: 1x1 conv recombines information across channels
        self.pointwise = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU6(inplace=True)
        )

    def forward(self, x):
        return self.pointwise(self.depthwise(x))


class InvertedResidual(nn.Module):
    """MobileNetV2 inverted residual: 1x1 expand -> 3x3 depthwise -> 1x1 linear projection."""

    def __init__(self, in_channels, out_channels, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        hidden_dim = int(in_channels * expand_ratio)
        # Skip connection only when spatial size and channel count are unchanged
        self.use_res_connect = (stride == 1 and in_channels == out_channels)

        stack = []
        if expand_ratio != 1:
            # 1x1 expansion to hidden_dim channels
            stack.extend([
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True)
            ])
        # Depthwise 3x3 (spatial mixing, per channel)
        stack.extend([
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride=stride,
                      padding=1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
        ])
        # Linear 1x1 projection (no activation: linear bottleneck)
        stack.extend([
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        ])
        self.conv = nn.Sequential(*stack)

    def forward(self, x):
        out = self.conv(x)
        return x + out if self.use_res_connect else out


class MobileNetV2(nn.Module):
    """MobileNetV2: a stem conv, a stack of inverted residual blocks, and a
    1x1 head conv, with all channel counts scaled by `width_mult`.

    Args:
        num_classes: classifier output size.
        width_mult: multiplier applied to every layer's channel count.
    """

    def __init__(self, num_classes=1000, width_mult=1.0):
        super(MobileNetV2, self).__init__()

        # Per stage: t=expand_ratio, c=out_channels, n=num_layers, s=stride of the first block
        inverted_residual_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        input_channel = int(32 * width_mult)
        # Head width is never reduced below 1280, even for width_mult < 1
        last_channel = int(1280 * max(1.0, width_mult))

        # Stem: ordinary 3x3 stride-2 conv
        features = [
            nn.Sequential(
                nn.Conv2d(3, input_channel, 3, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(input_channel),
                nn.ReLU6(inplace=True)
            )
        ]

        # Inverted residual stages; only the first block of each stage strides
        for t, c, n, s in inverted_residual_settings:
            output_channel = int(c * width_mult)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(
                    InvertedResidual(input_channel, output_channel, stride, expand_ratio=t)
                )
                input_channel = output_channel

        # Head: 1x1 conv up to last_channel
        features.append(nn.Sequential(
            nn.Conv2d(input_channel, last_channel, 1, bias=False),
            nn.BatchNorm2d(last_channel),
            nn.ReLU6(inplace=True)
        ))

        self.features = nn.Sequential(*features)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(last_channel, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x


# Parameter comparison: standard 3x3 conv vs depthwise separable (512 -> 512)
standard_conv_params = 3 * 3 * 512 * 512  # standard convolution
dw_sep_params = (3 * 3 * 512) + (512 * 512)  # depthwise + pointwise
print(f"표준 합성곱: {standard_conv_params:,}")
print(f"Depthwise Separable: {dw_sep_params:,}")
print(f"절감 비율: {(1 - dw_sep_params/standard_conv_params):.1%}")

EfficientNet (2019, Tan) - 복합 스케일링

EfficientNet은 폭(Width), 깊이(Depth), 해상도(Resolution)를 복합적으로 스케일링하는 방법을 제안했습니다.

import torch
import torch.nn as nn
import math

class MBConvBlock(nn.Module):
    """EfficientNet MBConv block: 1x1 expand -> depthwise -> squeeze-and-excitation -> 1x1 project.

    The residual connection is used when stride == 1 and the channel counts match.

    Fix vs. original: the SE global pooling is now done functionally in
    forward(); previously an nn.AdaptiveAvgPool2d was appended to the conv
    stack and then skipped with a loop/break, leaving a dead module, a
    no-op `layers[:len(layers)]` slice (mislabeled as "excluding SE"), and
    an unused `_hidden_dim` attribute.
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride, expand_ratio, se_ratio=0.25):
        super(MBConvBlock, self).__init__()
        self.stride = stride
        self.use_res = (stride == 1 and in_channels == out_channels)

        hidden_dim = in_channels * expand_ratio

        layers = []
        if expand_ratio != 1:
            # Expansion: widen channels before the depthwise conv
            layers += [
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim, momentum=0.01, eps=1e-3),
                nn.SiLU()
            ]

        # Depthwise conv: spatial mixing, one filter per channel
        layers += [
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride=stride,
                      padding=kernel_size//2, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim, momentum=0.01, eps=1e-3),
            nn.SiLU()
        ]
        self.conv = nn.Sequential(*layers)

        # Squeeze-and-Excitation: channel attention from globally pooled features
        se_channels = max(1, int(in_channels * se_ratio))
        self.se_reduce = nn.Conv2d(hidden_dim, se_channels, 1)
        self.se_expand = nn.Conv2d(se_channels, hidden_dim, 1)
        self.se_act = nn.SiLU()

        # Projection back to out_channels (no activation: linear bottleneck)
        self.project = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels, momentum=0.01, eps=1e-3)
        )

    def forward(self, x):
        identity = x

        # Expand + depthwise
        out = self.conv(x)

        # SE: global average -> reduce -> expand -> sigmoid gate per channel
        se = out.mean([2, 3], keepdim=True)
        se = self.se_act(self.se_reduce(se))
        se = torch.sigmoid(self.se_expand(se))
        out = out * se

        out = self.project(out)

        if self.use_res:
            out = out + identity

        return out


# EfficientNet compound-scaling coefficients per variant:
# (width_coeff, depth_coeff, resolution, dropout_rate)
efficientnet_params = {
    'b0': (1.0, 1.0, 224, 0.2),
    'b1': (1.0, 1.1, 240, 0.2),
    'b2': (1.1, 1.2, 260, 0.3),
    'b3': (1.2, 1.4, 300, 0.3),
    'b4': (1.4, 1.8, 380, 0.4),
    'b5': (1.6, 2.2, 456, 0.4),
    'b6': (1.8, 2.6, 528, 0.5),
    'b7': (2.0, 3.1, 600, 0.5),
}

print("EfficientNet 스케일링 파라미터:")
for name, (width, depth, res, dropout) in efficientnet_params.items():
    print(f"  B{name[1]}: 폭={width:.1f}, 깊이={depth:.1f}, 해상도={res}, 드롭아웃={dropout}")

ConvNeXt (2022, Liu) - Modern ConvNet

ConvNeXt는 ViT의 디자인 원칙을 CNN에 적용하여 Transformer와 대등한 성능을 달성한 "modernized" ConvNet입니다.

import torch
import torch.nn as nn

class ConvNeXtBlock(nn.Module):
    """ConvNeXt block: 7x7 depthwise conv -> LayerNorm -> inverted MLP (4x) -> layer scale -> residual."""

    def __init__(self, dim, layer_scale_init_value=1e-6):
        super(ConvNeXtBlock, self).__init__()

        # Large-kernel depthwise conv gives an attention-like receptive field
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)

        self.norm = nn.LayerNorm(dim, eps=1e-6)

        # Inverted bottleneck: pointwise convs as Linear on channels-last tensors
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)

        # Learnable per-channel residual scaling; disabled when init value <= 0
        if layer_scale_init_value > 0:
            self.gamma = nn.Parameter(
                layer_scale_init_value * torch.ones(dim),
                requires_grad=True
            )
        else:
            self.gamma = None

    def forward(self, x):
        shortcut = x

        x = self.dwconv(x)

        # NCHW -> NHWC so LayerNorm/Linear act on the channel dimension
        x = x.permute(0, 2, 3, 1)
        x = self.pwconv2(self.act(self.pwconv1(self.norm(x))))

        if self.gamma is not None:
            x = self.gamma * x

        # NHWC -> NCHW
        x = x.permute(0, 3, 1, 2)

        return shortcut + x


class ConvNeXt(nn.Module):
    """ConvNeXt implementation (Liu et al., 2022).

    Layout: a 4x4/stride-4 "patchify" stem, four stages of
    ConvNeXtBlocks separated by 2x downsampling layers, then global
    average pooling, LayerNorm and a linear classifier head.

    Fixes over the previous version: removed dead code (an unused local
    `stem` Sequential built around an `if False else` expression, and
    an always-empty `downsample_layers` ModuleList) and replaced the
    mutable list defaults with tuples.

    Args:
        in_channels: number of input image channels.
        num_classes: size of the classification head output.
        depths: number of ConvNeXtBlocks per stage.
        dims: channel width of each stage.
    """

    def __init__(self, in_channels=3, num_classes=1000,
                 depths=(3, 3, 9, 3), dims=(96, 192, 384, 768)):
        super(ConvNeXt, self).__init__()

        # Patchify stem: non-overlapping 4x4 convolution (stride 4).
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, dims[0], kernel_size=4, stride=4),
        )

        # Four stages; stages 1-3 are preceded by norm + 2x2/stride-2
        # downsampling, stage 0 by Identity (the stem already downsampled).
        self.stages = nn.ModuleList()
        self.downsamples = nn.ModuleList()

        for i in range(4):
            if i > 0:
                self.downsamples.append(nn.Sequential(
                    # GroupNorm(1, C) normalizes over all channels per
                    # sample, i.e. a channel-wise LayerNorm in NCHW layout.
                    nn.GroupNorm(1, dims[i - 1]),
                    nn.Conv2d(dims[i - 1], dims[i], kernel_size=2, stride=2)
                ))
            else:
                self.downsamples.append(nn.Identity())

            self.stages.append(nn.Sequential(
                *[ConvNeXtBlock(dims[i]) for _ in range(depths[i])]
            ))

        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)
        self.head = nn.Linear(dims[-1], num_classes)

    def forward(self, x):
        """Return class logits of shape (N, num_classes)."""
        x = self.stem(x)
        for i, (ds, stage) in enumerate(zip(self.downsamples, self.stages)):
            if i > 0:
                x = ds(x)
            x = stage(x)

        x = x.mean([-2, -1])  # global average pooling over H, W
        x = self.norm(x)
        x = self.head(x)
        return x


# ConvNeXt-T: depths=[3,3,9,3], dims=[96,192,384,768]
# Smoke test: build ConvNeXt-Tiny, run a dummy batch through it and
# report the logits shape and the total parameter count.
model = ConvNeXt(num_classes=1000, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768])
x = torch.randn(2, 3, 224, 224)  # dummy batch: 2 RGB images at 224x224
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"ConvNeXt-T 출력: {out.shape}, 파라미터: {params:,}")

3. Vision Transformer (ViT)

ViT는 이미지를 패치로 분할하고 Transformer를 적용하는 획기적인 접근법입니다.

import torch
import torch.nn as nn
import math

class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and embed each one.

    Implemented as a single Conv2d whose kernel size and stride both
    equal the patch size, so every output position corresponds to one
    patch projected into `embed_dim` channels.
    """

    def __init__(self, image_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super(PatchEmbedding, self).__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        # Patches per row squared = total patch count.
        self.num_patches = (image_size // patch_size) ** 2

        self.projection = nn.Conv2d(
            in_channels, embed_dim,
            kernel_size=patch_size, stride=patch_size
        )

    def forward(self, x):
        """(B, C, H, W) -> (B, num_patches, embed_dim)."""
        patches = self.projection(x)   # (B, embed_dim, H/p, W/p)
        tokens = patches.flatten(2)    # (B, embed_dim, num_patches)
        return tokens.transpose(1, 2)  # (B, num_patches, embed_dim)


class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention over a sequence of token embeddings."""

    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # 1/sqrt(head_dim) keeps the attention logits well-ranged.
        self.scale = self.head_dim ** -0.5

        # One fused projection producing Q, K and V together.
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """(B, N, C) -> (B, N, C), where C == embed_dim."""
        batch, seq_len, channels = x.shape

        # (B, N, 3C) -> (3, B, heads, N, head_dim), then split.
        qkv = (self.qkv(x)
               .reshape(batch, seq_len, 3, self.num_heads, self.head_dim)
               .permute(2, 0, 3, 1, 4))
        queries, keys, values = qkv.unbind(0)

        # Scaled dot-product attention with dropout on the weights.
        weights = torch.softmax(queries @ keys.transpose(-2, -1) * self.scale, dim=-1)
        weights = self.dropout(weights)

        # Merge heads back into (B, N, C) and project the output.
        mixed = (weights @ values).transpose(1, 2).reshape(batch, seq_len, channels)
        return self.proj(mixed)


class TransformerBlock(nn.Module):
    """Pre-norm Transformer encoder block: self-attention and an MLP,
    each wrapped in a residual connection."""

    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.0):
        super(TransformerBlock, self).__init__()
        hidden_dim = int(embed_dim * mlp_ratio)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        # Pre-norm: LayerNorm is applied before each sub-layer, and the
        # sub-layer output is added back to its input (residual).
        x = x + self.attn(self.norm1(x))
        return x + self.mlp(self.norm2(x))


class VisionTransformer(nn.Module):
    """Vision Transformer (ViT, Dosovitskiy et al., 2020).

    The image is split into fixed-size patches, linearly embedded, given
    a learnable [CLS] token and position embeddings, processed by a
    stack of Transformer blocks, and classified from the [CLS] token.
    """

    def __init__(self, image_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12,
                 mlp_ratio=4.0, dropout=0.0):
        super(VisionTransformer, self).__init__()

        self.patch_embed = PatchEmbedding(image_size, patch_size, in_channels, embed_dim)

        # Learnable classification token plus per-position embeddings
        # (num_patches + 1 slots: the extra slot is for the CLS token).
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(
            torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)
        )
        self.pos_dropout = nn.Dropout(dropout)

        # Encoder stack.
        self.blocks = nn.Sequential(*(
            TransformerBlock(embed_dim, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ))

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        self._init_weights()

    def _init_weights(self):
        # Standard ViT init: truncated normal (std 0.02) for the token
        # and position embeddings and all Linear weights, zero biases.
        nn.init.trunc_normal_(self.pos_embedding, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x):
        """(B, C, H, W) -> (B, num_classes) logits."""
        batch = x.shape[0]

        tokens = self.patch_embed(x)                # (B, N, D)

        # Prepend one CLS token per sample.
        cls = self.cls_token.expand(batch, -1, -1)  # (B, 1, D)
        tokens = torch.cat([cls, tokens], dim=1)    # (B, N+1, D)

        tokens = self.pos_dropout(tokens + self.pos_embedding)

        tokens = self.blocks(tokens)
        tokens = self.norm(tokens)

        # Classify from the CLS token only.
        return self.head(tokens[:, 0])


# ViT 변형들
def vit_small(num_classes=1000):
    """ViT-Small: 384-dim embeddings, 12 layers, 6 attention heads."""
    config = dict(image_size=224, patch_size=16, embed_dim=384,
                  depth=12, num_heads=6, num_classes=num_classes)
    return VisionTransformer(**config)

def vit_base(num_classes=1000):
    """ViT-Base: 768-dim embeddings, 12 layers, 12 attention heads."""
    config = dict(image_size=224, patch_size=16, embed_dim=768,
                  depth=12, num_heads=12, num_classes=num_classes)
    return VisionTransformer(**config)

def vit_large(num_classes=1000):
    """ViT-Large: 1024-dim embeddings, 24 layers, 16 attention heads."""
    config = dict(image_size=224, patch_size=16, embed_dim=1024,
                  depth=24, num_heads=16, num_classes=num_classes)
    return VisionTransformer(**config)


# Test
# Smoke test: ViT-Base forward pass on a dummy batch; expects logits of
# shape (2, 1000) and roughly 86M parameters.
model = vit_base()
x = torch.randn(2, 3, 224, 224)
out = model(x)
params = sum(p.numel() for p in model.parameters())
print(f"ViT-Base 출력: {out.shape}, 파라미터: {params:,}")

4. Object Detection 아키텍처

YOLO 계열 간단 구현

import torch
import torch.nn as nn

class YOLOHead(nn.Module):
    """Simplified YOLO detection head.

    For each spatial cell it predicts, per anchor: box coordinates
    (x, y, w, h), an objectness score, and per-class scores.
    """

    def __init__(self, in_channels, num_anchors, num_classes):
        super(YOLOHead, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes

        # 5 = (x, y, w, h, objectness); one group of 5+C per anchor.
        pred_channels = num_anchors * (5 + num_classes)
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, in_channels * 2, kernel_size=3, padding=1),
            nn.BatchNorm2d(in_channels * 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(in_channels * 2, pred_channels, kernel_size=1)
        )

    def forward(self, x):
        """(B, C, H, W) -> (B, num_anchors, H, W, 5 + num_classes)."""
        preds = self.head(x)
        batch, _, height, width = preds.shape
        # Split the fused channel axis into (anchor, prediction) groups.
        preds = preds.reshape(
            batch, self.num_anchors, 5 + self.num_classes, height, width
        )
        # Move the prediction vector to the last dimension.
        return preds.permute(0, 1, 3, 4, 2).contiguous()


# 간단한 YOLOv1 스타일 모델
class SimpleYOLO(nn.Module):
    """YOLOv1-style single-stage detector on top of a backbone.

    The backbone feature map is pooled to a fixed grid_size x grid_size
    grid; each cell predicts `num_boxes` boxes (5 values each:
    x, y, w, h, confidence) plus `num_classes` class scores.

    Args:
        backbone: feature extractor returning (B, backbone_channels, H, W).
        num_classes: number of object classes.
        num_boxes: boxes predicted per grid cell.
        backbone_channels: channel count of the backbone output
            (previously hard-coded to 512; now a parameter with the same
            default, so existing callers are unaffected).
        grid_size: output grid resolution (YOLOv1 uses 7).
    """

    def __init__(self, backbone, num_classes=80, num_boxes=2,
                 backbone_channels=512, grid_size=7):
        super(SimpleYOLO, self).__init__()
        self.backbone = backbone
        self.num_classes = num_classes
        self.num_boxes = num_boxes
        self.grid_size = grid_size

        # Per-cell prediction vector length: B boxes * 5 + class scores.
        cell_preds = num_boxes * 5 + num_classes
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d((grid_size, grid_size)),
            nn.Flatten(),
            nn.Linear(backbone_channels * grid_size * grid_size, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, grid_size * grid_size * cell_preds)
        )

    def forward(self, x):
        """Return predictions of shape (B, grid, grid, num_boxes*5 + num_classes)."""
        features = self.backbone(x)
        out = self.head(features)
        return out.reshape(-1, self.grid_size, self.grid_size,
                           self.num_boxes * 5 + self.num_classes)

5. Image Segmentation: U-Net

import torch
import torch.nn as nn
import torch.nn.functional as F

class DoubleConv(nn.Module):
    """U-Net's (Conv3x3 -> BatchNorm -> ReLU) x 2 block.

    Padding=1 with 3x3 kernels keeps the spatial size unchanged.
    """

    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        layers = []
        channels = in_channels
        for _ in range(2):
            layers += [
                nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ]
            channels = out_channels  # second conv maps out -> out
        self.double_conv = nn.Sequential(*layers)

    def forward(self, x):
        return self.double_conv(x)


class UNet(nn.Module):
    """U-Net for semantic segmentation (Ronneberger et al., 2015).

    Encoder-decoder with skip connections: each encoder stage's output
    is concatenated with the matching decoder stage's upsampled input.

    Args:
        in_channels: input image channels (1 for grayscale medical images).
        num_classes: number of output segmentation classes.
        features: encoder channel widths, shallow to deep. Changed from
            a list to a tuple default to avoid the mutable-default-
            argument pitfall; accepted values are unchanged.
    """

    def __init__(self, in_channels=1, num_classes=2, features=(64, 128, 256, 512)):
        super(UNet, self).__init__()

        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.pool = nn.MaxPool2d(2, 2)

        # Encoder: one DoubleConv per level, channels widening each step.
        for feature in features:
            self.encoders.append(DoubleConv(in_channels, feature))
            in_channels = feature

        # Bottleneck between the encoder and decoder paths.
        self.bottleneck = DoubleConv(features[-1], features[-1] * 2)

        # Decoder: alternating (transposed-conv upsample, DoubleConv).
        # The DoubleConv input is feature*2 because the upsampled map is
        # concatenated with the skip connection of the same width.
        for feature in reversed(features):
            self.decoders.append(
                nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2)
            )
            self.decoders.append(DoubleConv(feature * 2, feature))

        # 1x1 conv producing per-pixel class logits.
        self.final_conv = nn.Conv2d(features[0], num_classes, kernel_size=1)

    def forward(self, x):
        """(B, in_channels, H, W) -> (B, num_classes, H, W) logits."""
        skip_connections = []

        # Encoder path: record each level's output before pooling.
        for encoder in self.encoders:
            x = encoder(x)
            skip_connections.append(x)
            x = self.pool(x)

        x = self.bottleneck(x)

        # Deepest skip first, matching decoder order.
        skip_connections = skip_connections[::-1]

        # Decoder path: upsample, align size, concat skip, refine.
        for i in range(0, len(self.decoders), 2):
            x = self.decoders[i](x)  # transposed-conv upsample
            skip = skip_connections[i // 2]

            # Odd input sizes leave the upsampled map one pixel smaller
            # than the skip; resize (nearest-neighbor default) to match.
            if x.shape != skip.shape:
                x = F.interpolate(x, size=skip.shape[2:])

            x = torch.cat([skip, x], dim=1)  # skip connection
            x = self.decoders[i + 1](x)      # DoubleConv

        return self.final_conv(x)


# Test
# Smoke test: grayscale 572x572 input (the size used in the original
# U-Net paper); padded convolutions keep the output at input resolution.
model = UNet(in_channels=1, num_classes=2)
x = torch.randn(4, 1, 572, 572)
out = model(x)
print(f"U-Net 출력: {out.shape}")  # (4, 2, 572, 572)

params = sum(p.numel() for p in model.parameters())
print(f"U-Net 파라미터: {params:,}")

6. 전이학습 실전 가이드

torchvision.models 활용

import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
import torch.optim as optim
from tqdm import tqdm

# Load ImageNet-pretrained models (weights download on first use).
model_resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
model_efficientnet = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.DEFAULT)
model_vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)

print(f"ResNet-50 파라미터: {sum(p.numel() for p in model_resnet.parameters()):,}")


def feature_extraction(num_classes, freeze=True):
    """Feature-extraction transfer learning: freeze the pretrained
    backbone and train only a newly attached classifier head.

    Args:
        num_classes: output classes of the new head.
        freeze: when True, disable gradients for all backbone weights.
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

    if freeze:
        for param in model.parameters():
            param.requires_grad = False

    # Swap the ImageNet classifier for a small task-specific head.
    # (model.fc.in_features is read before the attribute is replaced.)
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(model.fc.in_features, 256),
        nn.ReLU(),
        nn.Linear(256, num_classes)
    )

    # Make sure the new head trains regardless of `freeze`.
    for param in model.fc.parameters():
        param.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"학습 가능 파라미터: {trainable:,} / {total:,} ({trainable/total:.1%})")

    return model


def fine_tuning(num_classes, unfreeze_layers=2):
    """Fine-tuning transfer learning: freeze the backbone, unfreeze the
    last `unfreeze_layers` stages, and attach a new classifier head.

    Args:
        num_classes: output classes of the new head.
        unfreeze_layers: how many of the last stages
            (layer4, avgpool, fc) to make trainable. 0 keeps the whole
            backbone frozen; only the new head trains.
    """
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

    # Freeze every pretrained weight first.
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last N stages. Guard against N == 0: the previous
    # `layers[-unfreeze_layers:]` slicing made N=0 unfreeze EVERYTHING
    # (list[-0:] is the whole list).
    if unfreeze_layers > 0:
        layers = [model.layer4, model.avgpool, model.fc]
        for layer in layers[-unfreeze_layers:]:
            for param in layer.parameters():
                param.requires_grad = True

    # Replace the classifier; freshly created parameters default to
    # requires_grad=True, so the head is always trainable.
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    return model


# 학습 루프
def train_model(model, train_loader, val_loader, epochs=10,
                learning_rate=1e-3, device='cuda'):
    """Train a classifier with discriminative learning rates and track
    per-epoch train/validation accuracy.

    Parameters whose name contains 'fc' (the classifier head) use the
    full `learning_rate`; all other trainable parameters use 10% of it.
    The best checkpoint by validation accuracy is saved to
    'best_model.pt'; note the *returned* model holds the last-epoch
    weights, not necessarily the best ones.

    Args:
        model: classifier to train (assumes a ResNet-style 'fc' head
            for the parameter-group split -- confirm for other models).
        train_loader: iterable of (images, labels) training batches.
        val_loader: iterable of (images, labels) validation batches.
        epochs: number of passes over the training set.
        learning_rate: base LR for the head; backbone gets LR * 0.1.
        device: torch device string, e.g. 'cuda' or 'cpu'.

    Returns:
        (model, history): the trained model and a dict mapping
        'train_loss'/'val_loss'/'train_acc'/'val_acc' to per-epoch lists.
    """

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()

    # Per-parameter-group learning rates: lower LR for the pretrained
    # backbone, full LR for the newly initialized head.
    backbone_params = [p for n, p in model.named_parameters()
                       if 'fc' not in n and p.requires_grad]
    head_params = [p for n, p in model.named_parameters()
                   if 'fc' in n and p.requires_grad]

    optimizer = optim.AdamW([
        {'params': backbone_params, 'lr': learning_rate * 0.1},
        {'params': head_params, 'lr': learning_rate}
    ], weight_decay=1e-4)

    # Cosine decay of both group LRs over the whole run.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    best_val_acc = 0.0
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss, train_correct, train_total = 0.0, 0, 0

        for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()

            # Gradient clipping guards against exploding gradients.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Accumulate batch-size-weighted sums so epoch averages are
            # exact even when the final batch is smaller.
            train_loss += loss.item() * images.size(0)
            train_correct += (outputs.argmax(1) == labels).sum().item()
            train_total += images.size(0)

        # Validation phase (no gradient tracking)
        model.eval()
        val_loss, val_correct, val_total = 0.0, 0, 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                val_correct += (outputs.argmax(1) == labels).sum().item()
                val_total += images.size(0)

        scheduler.step()

        train_acc = train_correct / train_total
        val_acc = val_correct / val_total
        epoch_train_loss = train_loss / train_total
        epoch_val_loss = val_loss / val_total

        history['train_loss'].append(epoch_train_loss)
        history['val_loss'].append(epoch_val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)

        print(f"에폭 {epoch+1}: Train={train_acc:.4f}, Val={val_acc:.4f}")

        # Save the best checkpoint (by validation accuracy) to disk.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')

    print(f"최고 검증 정확도: {best_val_acc:.4f}")
    return model, history


# 데이터 증강
def get_transforms(image_size=224):
    """Build (train, val) torchvision transform pipelines.

    Training applies random crop/flip/rotation/color-jitter
    augmentation; validation uses a deterministic resize + center crop.
    Both normalize with ImageNet channel statistics.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2,
                               saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])

    val_transforms = transforms.Compose([
        # Resize the shorter side ~14% larger than the crop, then take
        # the center crop (standard ImageNet evaluation protocol).
        transforms.Resize(int(image_size * 1.14)),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])

    return train_transforms, val_transforms

아키텍처 성능 비교

| 모델 | 년도 | Top-1 정확도 | 파라미터 | FLOPs |
| --- | --- | --- | --- | --- |
| LeNet-5 | 1998 | ~99% (MNIST) | 60K | - |
| AlexNet | 2012 | 56.5% | 61M | 724M |
| VGG-16 | 2014 | 71.6% | 138M | 15.5G |
| GoogLeNet | 2014 | 68.7% | 6.8M | 1.5G |
| ResNet-50 | 2015 | 75.3% | 25M | 4.1G |
| DenseNet-121 | 2017 | 74.4% | 8M | 2.9G |
| MobileNetV2 | 2018 | 71.8% | 3.4M | 300M |
| EfficientNet-B0 | 2019 | 77.1% | 5.3M | 390M |
| ConvNeXt-T | 2022 | 82.1% | 28M | 4.5G |
| ViT-B/16 | 2020 | 81.8% | 86M | 17.6G |

마무리

CNN 아키텍처는 지속적으로 진화하고 있습니다.

  • LeNet (1998): 최초의 실용적 CNN, 구조의 근간 확립
  • AlexNet (2012): 딥러닝 르네상스, ReLU와 Dropout 도입
  • VGGNet (2014): 3x3 컨볼루션의 힘, 깊이의 중요성 입증
  • ResNet (2015): 잔차 연결로 기울기 소실 해결, 수백 층 학습 가능
  • DenseNet (2017): 조밀한 연결로 특징 재사용 극대화
  • MobileNet (2017): 경량화로 모바일 배포 실현
  • EfficientNet (2019): 복합 스케일링으로 최고 효율 달성
  • ConvNeXt (2022): Transformer 인사이트를 CNN에 적용
  • ViT (2020): 이미지도 시퀀스로 처리하는 새로운 패러다임

실전에서는 torchvision의 사전학습 모델에서 시작하여 전이학습으로 빠르게 목표 태스크에 적용하는 것을 권장합니다.

참고 자료

  • PyTorch Vision Models
  • ResNet 논문: He et al., "Deep Residual Learning for Image Recognition" (arXiv:1512.03385)
  • EfficientNet 논문: Tan & Le, "EfficientNet: Rethinking Model Scaling" (arXiv:1905.11946)
  • ViT 논문: Dosovitskiy et al., "An Image is Worth 16x16 Words" (arXiv:2010.11929)
  • ConvNeXt 논문: Liu et al., "A ConvNet for the 2020s" (arXiv:2201.03545)