[심층 강화학습] 03. PyTorch 딥러닝 기초: 텐서부터 신경망까지

PyTorch 소개

PyTorch는 Facebook(현 Meta)에서 개발한 딥러닝 프레임워크입니다. 동적 계산 그래프, 직관적인 API, 강력한 GPU 지원이 특징이며, 강화학습 연구에서 가장 널리 사용되는 프레임워크 중 하나입니다.

설치

pip install torch torchvision

텐서 (Tensor)

텐서는 PyTorch의 기본 데이터 구조로, NumPy 배열과 비슷하지만 GPU 연산과 자동 미분을 지원합니다.

텐서 생성

import torch
import numpy as np

# Build tensors in several different ways.
# 1) From a Python list
t1 = torch.tensor([1, 2, 3])
print(f"리스트로부터: {t1}")

# 2) Filled with a constant or with random samples (shape passed as a tuple)
t_zeros = torch.zeros((3, 4))
t_ones = torch.ones((2, 3))
t_rand = torch.rand((2, 3))  # uniform samples between 0 and 1
t_randn = torch.randn((2, 3))  # standard normal samples

print(f"영 텐서:\n{t_zeros}")
print(f"랜덤 텐서:\n{t_rand}")

# 3) From a NumPy array
np_array = np.array(((1.0, 2.0), (3.0, 4.0)))
t_from_np = torch.from_numpy(np_array)
print(f"NumPy로부터: {t_from_np}")

# 4) And back from a tensor to NumPy
back_to_np = t_from_np.numpy()
print(f"다시 NumPy로: {back_to_np}")

텐서 속성

# Sample a rank-3 tensor and print its metadata.
t = torch.randn((3, 4, 5))

print(f"형상 (shape): {t.shape}")  # size along every dimension
print(f"데이터 타입 (dtype): {t.dtype}")  # element type
print(f"장치 (device): {t.device}")  # where the data lives
print(f"차원 수 (ndim): {t.ndim}")  # number of dimensions
print(f"전체 원소 수 (numel): {t.numel()}")  # total element count

텐서 연산

a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])

# Arithmetic: element-wise operators versus matrix multiplication
print(f"덧셈: {a + b}")
print(f"곱셈 (원소별): {a * b}")
print(f"행렬 곱: {a @ b}")
print(f"행렬 곱 (동일): {torch.matmul(a, b)}")

# Changing the shape of a 12-element vector
t = torch.arange(0, 12)
print(f"원본: {t.shape}")

t_reshaped = t.reshape((3, 4))
print(f"reshape(3,4): {t_reshaped.shape}")

t_viewed = t.view((2, 6))
print(f"view(2,6): {t_viewed.shape}")

# Inserting and dropping size-1 dimensions
t = torch.randn((3, 4))
t_unsqueeze = t.unsqueeze(dim=0)  # prepend a batch dimension
print(f"unsqueeze(0): {t_unsqueeze.shape}")  # (1, 3, 4)

t_squeeze = t_unsqueeze.squeeze(dim=0)  # drop the size-1 dimension again
print(f"squeeze(0): {t_squeeze.shape}")  # (3, 4)

# Indexing and slicing
t = torch.randn((4, 5))
print(f"첫 번째 행: {t[0]}")
print(f"마지막 열: {t[:, -1]}")
print(f"2x3 부분: {t[:2, :3].shape}")

GPU 연산

PyTorch에서 GPU를 사용하면 대규모 텐서 연산을 병렬로 처리할 수 있습니다.

# Choose the compute device: prefer CUDA, then Apple-Silicon MPS, then CPU.
# `torch.backends.mps` is missing from older PyTorch builds, so it is looked up
# defensively instead of being accessed directly.
mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
    print("Apple Silicon GPU 사용")
else:
    device = torch.device("cpu")
# Report the device only after the final choice has been made.
print(f"사용 장치: {device}")

# Move an existing tensor onto the selected device
t = torch.randn(1000, 1000)
t_gpu = t.to(device)
print(f"텐서 장치: {t_gpu.device}")

# Allocate the operands directly on the device and multiply them there
a = torch.randn(1000, 1000, device=device)
b = torch.randn(1000, 1000, device=device)
c = a @ b

# Bring the result back to the CPU
c_cpu = c.cpu()

그래디언트와 자동 미분 (Autograd)

PyTorch의 가장 강력한 기능 중 하나는 자동 미분입니다. requires_grad=True로 설정된 텐서의 모든 연산을 추적하여 자동으로 그래디언트를 계산합니다.

기본 자동 미분

# Turn on gradient tracking for the input tensor
x = torch.tensor([2.0, 3.0], requires_grad=True)

# Forward pass: y = x^2 + 3x
y = x.pow(2) + 3 * x
print(f"y = {y}")

# Backward pass: dy/dx = 2x + 3.
# The vector y is summed to a scalar before calling backward().
z = torch.sum(y)
z.backward()

print(f"x의 그래디언트: {x.grad}")
# At x=2: dy/dx = 2*2+3 = 7
# At x=3: dy/dx = 2*3+3 = 9
# Prints: tensor([7., 9.])

그래디언트 계산 제어

# Switching gradient tracking off (used at inference time)
x = torch.randn((3,), requires_grad=True)

# Option 1: the torch.no_grad() context manager
with torch.no_grad():
    y = torch.mul(x, 2)
    print(f"requires_grad: {y.requires_grad}")  # False

# Option 2: detach() returns a tensor cut off from the graph
z = x.detach()
print(f"detach 후 requires_grad: {z.requires_grad}")  # False

# Resetting gradients (important when training iteratively)
x = torch.tensor([1.0], requires_grad=True)

for i in range(3):
    y = x.pow(2)
    y.backward()
    print(f"반복 {i}: grad = {x.grad}")
    # backward() accumulates into x.grad, so it must be cleared every iteration
    x.grad.zero_()

간단한 최적화 예시

# Minimise f(x) = (x - 3)^2 with plain gradient descent
x = torch.tensor([0.0], requires_grad=True)
learning_rate = 0.1

for step in range(20):
    # Forward: evaluate the loss at the current x
    loss = (x - 3).pow(2)

    # Backward: populate x.grad
    loss.backward()

    # Update the parameter in place, outside of gradient tracking
    with torch.no_grad():
        x.sub_(learning_rate * x.grad)

    # Clear the accumulated gradient before the next step
    x.grad.zero_()

    if not step % 5:
        print(f"스텝 {step}: x = {x.item():.4f}, loss = {loss.item():.6f}")

print(f"최종 x = {x.item():.4f} (목표: 3.0)")

신경망 구성 요소 (nn 모듈)

PyTorch의 torch.nn 모듈은 신경망 구성에 필요한 모든 빌딩 블록을 제공합니다.

기본 신경망 구성

import torch
import torch.nn as nn

class SimpleNetwork(nn.Module):
    """Small feed-forward network: three linear layers with ReLU in between.

    Args:
        input_size: Number of input features.
        hidden_size: Width shared by both hidden layers.
        output_size: Number of outputs; no activation follows the last layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        widths = (input_size, hidden_size, hidden_size, output_size)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # The output layer is left linear, so drop the trailing activation.
        layers.pop()
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.network(x)

# Build the model and try it out
model = SimpleNetwork(4, 128, 2)  # input_size=4, hidden_size=128, output_size=2
print(model)

# Forward pass on random data: batch of 32 samples, 4 features each
x = torch.randn((32, 4))
output = model(x)
print(f"출력 형상: {output.shape}")  # (32, 2)

# Count the parameters, in total and those that receive gradients
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)
print(f"전체 파라미터: {total_params:,}")
print(f"학습 가능 파라미터: {trainable_params:,}")

주요 레이어

# Linear (fully connected) layer: 10 inputs -> 5 outputs
linear = nn.Linear(10, 5)

# 2-D convolution: 3 -> 32 channels, 3x3 kernel, padding of 1
conv = nn.Conv2d(3, 32, kernel_size=3, padding=1)

# Batch normalisation over 32 feature maps
bn = nn.BatchNorm2d(32)

# Dropout with probability 0.5
dropout = nn.Dropout(0.5)

# Activation functions
relu, tanh, sigmoid = nn.ReLU(), nn.Tanh(), nn.Sigmoid()
softmax = nn.Softmax(dim=-1)

커스텀 레이어

자신만의 레이어를 정의할 수 있습니다. nn.Module을 상속하고 forward 메서드를 구현하면 됩니다.

class NoisyLinear(nn.Module):
    """Linear layer that perturbs its weights with Gaussian noise while training.

    Intended to encourage exploration. In eval mode the layer behaves exactly
    like the wrapped ``nn.Linear``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        noise_std: Scale applied to the standard-normal weight noise.
    """

    def __init__(self, in_features, out_features, noise_std=0.1):
        super().__init__()
        self.noise_std = noise_std
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the layer; fresh weight noise is drawn on every training-mode call."""
        if not self.training:
            return self.linear(x)

        base_weight = self.linear.weight
        perturbed = base_weight + torch.randn_like(base_weight) * self.noise_std
        # Only the weight is perturbed; the bias is used as-is.
        return x @ perturbed.t() + self.linear.bias

class DuelingHead(nn.Module):
    """Output head with the Dueling DQN layout.

    Separate value and advantage streams are combined as
    ``Q = V + (A - mean(A))``.

    Args:
        input_size: Width of the incoming feature vector.
        n_actions: Number of actions, i.e. the width of the advantage stream.
    """

    def __init__(self, input_size, n_actions):
        super().__init__()
        self.value_stream = self._make_stream(input_size, 1)
        self.advantage_stream = self._make_stream(input_size, n_actions)

    @staticmethod
    def _make_stream(in_dim, out_dim):
        """Build one two-layer stream with a 128-unit ReLU hidden layer."""
        return nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Linear(128, out_dim),
        )

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        value = self.value_stream(x)
        advantage = self.advantage_stream(x)
        # Mean over the action dimension, kept as a size-1 dim for broadcasting.
        mean_advantage = advantage.mean(dim=-1, keepdim=True)
        return value + advantage - mean_advantage

# Example: 16 feature vectors of width 256, 4 actions
head = DuelingHead(256, 4)
features = torch.randn((16, 256))
q_values = head(features)
print(f"Q 값 형상: {q_values.shape}")  # (16, 4)

손실 함수

강화학습에서 자주 사용되는 손실 함수들입니다.

# Mean squared error (value-function regression)
predicted = torch.tensor([2.5, 0.0, -1.0])
target = torch.tensor([3.0, -0.5, -1.0])
mse_loss = nn.MSELoss()
loss = mse_loss(predicted, target)
print(f"MSE 손실: {loss.item():.4f}")

# Huber-style loss via SmoothL1Loss (common in DQN; more robust to outliers than MSE)
huber_loss = nn.SmoothL1Loss()
loss = huber_loss(predicted, target)
print(f"Huber 손실: {loss.item():.4f}")

# Cross-entropy (policy learning)
logits = torch.tensor([[2.0, 1.0, 0.1]])  # logits for 3 actions
target_action = torch.tensor([0])  # index of the correct action
ce_loss = nn.CrossEntropyLoss()
loss = ce_loss(logits, target_action)
print(f"Cross-Entropy 손실: {loss.item():.4f}")

옵티마이저

model = SimpleNetwork(4, 128, 2)

# SGD with momentum
optimizer_sgd = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

# Adam (the most commonly used)
optimizer_adam = torch.optim.Adam(model.parameters(), lr=1e-3)

# RMSprop (used in the original DQN paper)
optimizer_rms = torch.optim.RMSprop(model.parameters(), lr=2.5e-4, alpha=0.95)

# Skeleton of a training loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(100):
    # Forward pass on a random batch
    x = torch.randn(32, 4)
    target = torch.randn(32, 2)
    prediction = model(x)

    # Loss
    loss = criterion(prediction, target)

    # Backward pass and parameter update
    optimizer.zero_grad()  # clear old gradients
    loss.backward()        # compute new gradients
    optimizer.step()       # apply the update

    if not epoch % 20:
        print(f"에폭 {epoch}: 손실 = {loss.item():.4f}")

학습률 스케줄러

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# StepLR: lower the learning rate every fixed number of epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

# ExponentialLR: exponential decay every epoch.
# `scheduler` is rebound here, so this is the scheduler stepped in the loop below.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

# How to use it
for epoch in range(100):
    # ... training code ...
    scheduler.step()
    current_lr = optimizer.param_groups[0]["lr"]

CNN으로 이미지 처리

Atari 게임 등 이미지 기반 강화학습에서는 합성곱 신경망(CNN)이 핵심입니다.

class AtariCNN(nn.Module):
    """Convolutional Q-network for 84x84 Atari frames (based on the DQN paper).

    Args:
        input_channels: Number of channels in each observation.
        n_actions: Number of actions; the network emits one value per action.
    """

    def __init__(self, input_channels, n_actions):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        # Size of the flattened conv output for an 84x84 input
        conv_output_size = self._get_conv_output_size(input_channels)

        self.fc = nn.Sequential(
            nn.Linear(conv_output_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_output_size(self, input_channels):
        """Return the flattened feature count the conv stack yields for one 84x84 frame."""
        # Shape probe only: skip autograd bookkeeping for this throw-away pass.
        with torch.no_grad():
            probe = self.conv(torch.zeros(1, input_channels, 84, 84))
        return int(probe[0].numel())

    def forward(self, x):
        """Map a batch of frames to per-action values.

        Args:
            x: Tensor of shape (batch, channels, 84, 84) holding values in 0-255.

        Returns:
            Tensor of shape (batch, n_actions).
        """
        # Rescale the 0-255 range to 0-1
        x = x.float() / 255.0
        conv_out = self.conv(x)
        # flatten() copies when the conv output is not contiguous (e.g. for
        # channels_last inputs), a case in which view() would raise.
        flat = torch.flatten(conv_out, start_dim=1)
        return self.fc(flat)

# Smoke test: 8 observations of 4 stacked 84x84 uint8 frames, 6 actions
model = AtariCNN(4, 6)
dummy_input = torch.randint(low=0, high=256, size=(8, 4, 84, 84), dtype=torch.uint8)
q_values = model(dummy_input)
print(f"Q값 형상: {q_values.shape}")  # (8, 6)

TensorBoard 모니터링

학습 과정을 시각적으로 모니터링하는 것은 매우 중요합니다.

from torch.utils.tensorboard import SummaryWriter
import time

# Create the TensorBoard writer
writer = SummaryWriter(log_dir="runs/rl_experiment")

# Log scalar series (reward, loss, ...)
for step in range(1000):
    fake_loss = 1.0 / (step + 1)
    fake_reward = step * 0.1
    fake_epsilon = max(0.01, 1.0 - step * 0.001)

    for tag, value in (
        ("training/loss", fake_loss),
        ("training/reward", fake_reward),
        ("training/epsilon", fake_epsilon),
    ):
        writer.add_scalar(tag, value, step)

# Several values on a single chart
comparison = {
    "train_loss": 0.5,
    "val_loss": 0.7,
}
writer.add_scalars("comparison", comparison, global_step=0)

# Record the model structure
model = SimpleNetwork(4, 128, 2)
dummy_input = torch.randn(1, 4)
writer.add_graph(model, dummy_input)

# Histograms (to inspect the weight distributions)
for name, param in model.named_parameters():
    writer.add_histogram(name, param.data, global_step=0)

writer.close()

TensorBoard를 실행하려면 터미널에서 다음 명령을 입력합니다.

tensorboard --logdir=runs

GAN으로 Atari 이미지 생성

GAN(Generative Adversarial Network)의 기본 개념을 이해하기 위해, Atari 게임 화면을 생성하는 간단한 GAN을 구현해 봅니다.

import torch
import torch.nn as nn

class Generator(nn.Module):
    """Generator: turns random noise vectors into images.

    Args:
        latent_dim: Length of the input noise vector.
        img_channels: Number of channels in the generated image.
        img_size: Height and width of the (square) generated image.
    """

    def __init__(self, latent_dim=100, img_channels=1, img_size=64):
        super().__init__()
        self.img_size = img_size
        self.img_channels = img_channels

        # Input: (batch, latent_dim). Three Linear -> LeakyReLU -> BatchNorm
        # stages of growing width, then a projection to the flat image.
        layers = []
        in_dim = latent_dim
        for out_dim in (256, 512, 1024):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.LeakyReLU(0.2),
                nn.BatchNorm1d(out_dim),
            ])
            in_dim = out_dim
        layers.extend([
            nn.Linear(in_dim, img_channels * img_size * img_size),
            nn.Tanh(),
        ])
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return images of shape (batch, img_channels, img_size, img_size) for noise ``z``."""
        flat_img = self.model(z)
        return flat_img.view(-1, self.img_channels, self.img_size, self.img_size)

class Discriminator(nn.Module):
    """Discriminator: scores images as real or fake.

    Args:
        img_channels: Number of channels in the input image.
        img_size: Height and width of the (square) input image.
    """

    def __init__(self, img_channels=1, img_size=64):
        super().__init__()
        layers = []
        in_dim = img_channels * img_size * img_size
        # Two Linear -> LeakyReLU -> Dropout stages of shrinking width
        for out_dim in (512, 256):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ])
            in_dim = out_dim
        # A single sigmoid unit produces the final score
        layers.extend([nn.Linear(in_dim, 1), nn.Sigmoid()])
        self.model = nn.Sequential(*layers)

    def forward(self, img):
        """Flatten each image and return a (batch, 1) tensor of sigmoid scores."""
        batch = img.size(0)
        return self.model(img.view(batch, -1))

GAN 학습 루프

def train_gan(n_epochs=50, batch_size=64, latent_dim=100):
    """Train the Generator/Discriminator pair on placeholder data.

    Each epoch performs one discriminator update followed by one generator
    update on a single synthetic batch. Losses are printed and written to
    TensorBoard every 10 epochs.

    Args:
        n_epochs: Number of training iterations.
        batch_size: Number of real and fake images per iteration.
        latent_dim: Length of the generator's input noise vector.

    Returns:
        The ``(generator, discriminator)`` tuple after training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    generator = Generator(latent_dim=latent_dim).to(device)
    discriminator = Discriminator().to(device)

    optimizer_g = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    criterion = nn.BCELoss()

    # The targets never change, so build them once, directly on the device.
    real_labels = torch.ones(batch_size, 1, device=device)
    fake_labels = torch.zeros(batch_size, 1, device=device)

    writer = SummaryWriter("runs/gan_atari")
    try:
        for epoch in range(n_epochs):
            # Placeholder "real" batch (use an actual dataset in practice).
            # Sampled uniformly in [-1, 1] so it shares the value range of the
            # generator's Tanh output.
            real_images = torch.rand(batch_size, 1, 64, 64, device=device) * 2 - 1

            # --- Discriminator step ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)

            d_real = discriminator(real_images)
            # detach() keeps this step from back-propagating into the generator.
            d_fake = discriminator(fake_images.detach())

            d_loss_real = criterion(d_real, real_labels)
            d_loss_fake = criterion(d_fake, fake_labels)
            d_loss = d_loss_real + d_loss_fake

            optimizer_d.zero_grad()
            d_loss.backward()
            optimizer_d.step()

            # --- Generator step ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)
            d_fake = discriminator(fake_images)

            # The generator is scored against the "real" labels.
            g_loss = criterion(d_fake, real_labels)

            optimizer_g.zero_grad()
            g_loss.backward()
            optimizer_g.step()

            if epoch % 10 == 0:
                print(f"에폭 {epoch}: D 손실={d_loss.item():.4f}, G 손실={g_loss.item():.4f}")
                writer.add_scalar("GAN/d_loss", d_loss.item(), epoch)
                writer.add_scalar("GAN/g_loss", g_loss.item(), epoch)
    finally:
        # Close the writer even if training raises.
        writer.close()

    return generator, discriminator

강화학습을 위한 PyTorch 팁

모델 저장과 불러오기

# Save the model: only the parameters (state_dict) are written
torch.save(model.state_dict(), "model.pth")

# Load the model: rebuild the same architecture, then restore the parameters.
# map_location="cpu" lets a checkpoint saved on a GPU be opened on a CPU-only machine.
# NOTE: torch.load unpickles the file, so only open checkpoints from a trusted source.
model = SimpleNetwork(4, 128, 2)
state_dict = torch.load("model.pth", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()  # switch to inference mode

배치 처리

# Convert a batch of RL experiences into tensors.
# Each experience is a (state, action, reward, next_state, done) tuple.
experiences = [
    (np.array([1.0, 2.0, 3.0, 4.0]), 1, 1.0, np.array([1.1, 2.1, 3.1, 4.1]), False),
    (np.array([0.5, 1.5, 2.5, 3.5]), 0, 0.0, np.array([0.6, 1.6, 2.6, 3.6]), True),
]

# Regroup the experiences column-wise: one tuple per field.
state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*experiences)

# Stack the per-step arrays into a single ndarray first: passing a list of
# ndarrays to torch.tensor takes a slow path and emits a UserWarning.
states = torch.tensor(np.stack(state_batch), dtype=torch.float32)
actions = torch.tensor(action_batch, dtype=torch.long)
rewards = torch.tensor(reward_batch, dtype=torch.float32)
next_states = torch.tensor(np.stack(next_state_batch), dtype=torch.float32)
dones = torch.tensor(done_batch, dtype=torch.bool)

print(f"상태 배치: {states.shape}")
print(f"행동 배치: {actions.shape}")

타겟 네트워크 복사

# In DQN the target network is refreshed from the online network periodically
online_net = SimpleNetwork(4, 128, 2)
target_net = SimpleNetwork(4, 128, 2)

# Hard update: copy every weight across
target_net.load_state_dict(online_net.state_dict())

# Soft update: move each target weight a fraction tau toward the online weight
tau = 0.005
param_pairs = zip(target_net.parameters(), online_net.parameters())
for target_param, online_param in param_pairs:
    blended = tau * online_param.data + (1 - tau) * target_param.data
    target_param.data.copy_(blended)

정리

이번 글에서 다룬 PyTorch 핵심 개념들입니다.

- 텐서: 다차원 배열로 GPU 연산과 자동 미분 지원
- 자동 미분: requires_grad=True와 .backward()로 그래디언트 자동 계산
- nn.Module: 신경망 구성의 기본 단위, forward() 메서드 구현
- 손실 함수: MSE, Huber, Cross-Entropy 등 목적에 맞는 손실 선택
- 옵티마이저: Adam, SGD, RMSprop 등으로 파라미터 업데이트
- TensorBoard: 학습 과정의 시각적 모니터링

다음 글에서는 이 PyTorch 기초를 바탕으로 Cross-Entropy 방법을 구현하여 CartPole을 풀어보겠습니다.