Split View: [심층 강화학습] 03. PyTorch 딥러닝 기초: 텐서부터 신경망까지

[심층 강화학습] 03. PyTorch 딥러닝 기초: 텐서부터 신경망까지

PyTorch 소개

PyTorch는 Facebook(현 Meta)에서 개발한 딥러닝 프레임워크입니다. 동적 계산 그래프, 직관적인 API, 강력한 GPU 지원이 특징이며, 강화학습 연구에서 가장 널리 사용되는 프레임워크 중 하나입니다.

설치

pip install torch torchvision

텐서 (Tensor)

텐서는 PyTorch의 기본 데이터 구조로, NumPy 배열과 비슷하지만 GPU 연산과 자동 미분을 지원합니다.

텐서 생성

import torch
import numpy as np

# Several ways to create a tensor
# From a Python list
t1 = torch.tensor([1, 2, 3])
print(f"리스트로부터: {t1}")

# Filled with fixed values or random samples
t_zeros = torch.zeros(3, 4)
t_ones = torch.ones(2, 3)
t_rand = torch.rand(2, 3)  # uniform distribution over 0-1
t_randn = torch.randn(2, 3)  # standard normal distribution

print(f"영 텐서:\n{t_zeros}")
print(f"랜덤 텐서:\n{t_rand}")

# From a NumPy array
np_array = np.array([[1.0, 2.0], [3.0, 4.0]])
t_from_np = torch.from_numpy(np_array)
print(f"NumPy로부터: {t_from_np}")

# Convert the tensor back to NumPy
back_to_np = t_from_np.numpy()
print(f"다시 NumPy로: {back_to_np}")

텐서 속성

# Inspect the basic attributes of a tensor created with shape (3, 4, 5)
t = torch.randn(3, 4, 5)

print(f"형상 (shape): {t.shape}")
print(f"데이터 타입 (dtype): {t.dtype}")
print(f"장치 (device): {t.device}")
print(f"차원 수 (ndim): {t.ndim}")
print(f"전체 원소 수 (numel): {t.numel()}")

텐서 연산

a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])

# Basic arithmetic
print(f"덧셈: {a + b}")
print(f"곱셈 (원소별): {a * b}")
print(f"행렬 곱: {a @ b}")
print(f"행렬 곱 (동일): {torch.matmul(a, b)}")

# Reshaping
t = torch.arange(12)
print(f"원본: {t.shape}")

t_reshaped = t.reshape(3, 4)
print(f"reshape(3,4): {t_reshaped.shape}")

t_viewed = t.view(2, 6)
print(f"view(2,6): {t_viewed.shape}")

# Adding / removing dimensions
t = torch.randn(3, 4)
t_unsqueeze = t.unsqueeze(0)  # add a leading batch dimension
print(f"unsqueeze(0): {t_unsqueeze.shape}")  # (1, 3, 4)

t_squeeze = t_unsqueeze.squeeze(0)  # remove the size-1 dimension
print(f"squeeze(0): {t_squeeze.shape}")  # (3, 4)

# Indexing and slicing
t = torch.randn(4, 5)
print(f"첫 번째 행: {t[0]}")
print(f"마지막 열: {t[:, -1]}")
print(f"2x3 부분: {t[:2, :3].shape}")

GPU 연산

PyTorch에서 GPU를 사용하면 대규모 텐서 연산을 병렬로 처리할 수 있습니다.

# Pick the compute device once, in priority order: CUDA, then MPS (Apple Silicon), then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The getattr guard keeps this working on torch builds that have no MPS backend module.
    device = torch.device("mps")
    print("Apple Silicon GPU 사용")
else:
    device = torch.device("cpu")
# Report the device only after the final choice so the message matches what is used below.
print(f"사용 장치: {device}")

# Move an existing tensor to the selected device
t = torch.randn(1000, 1000)
t_gpu = t.to(device)
print(f"텐서 장치: {t_gpu.device}")

# Create tensors directly on the device and compute there
a = torch.randn(1000, 1000, device=device)
b = torch.randn(1000, 1000, device=device)
c = a @ b  # matrix product runs on the selected device

# Move the result back to the CPU
c_cpu = c.cpu()

그래디언트와 자동 미분 (Autograd)

PyTorch의 가장 강력한 기능 중 하나는 자동 미분입니다. requires_grad=True로 설정된 텐서에 수행되는 모든 연산을 추적하고, .backward()를 호출하면 그 기록을 따라 그래디언트를 자동으로 계산합니다.

기본 자동 미분

# Create a leaf tensor whose operations are tracked for differentiation
x = torch.tensor([2.0, 3.0]).requires_grad_()

# Forward pass: y = x^2 + 3x
y = x.pow(2) + x * 3
print(f"y = {y}")

# Backward pass: reduce to a scalar first, then dy/dx = 2x + 3
z = torch.sum(y)
z.backward()

print(f"x의 그래디언트: {x.grad}")
# at x=2: dy/dx = 2*2+3 = 7
# at x=3: dy/dx = 2*3+3 = 9
# prints: tensor([7., 9.])

그래디언트 계산 제어

# Disabling gradient tracking (used at inference time)
x = torch.randn(3, requires_grad=True)

# Option 1: torch.no_grad()
with torch.no_grad():
    y = x * 2
    print(f"requires_grad: {y.requires_grad}")  # False

# Option 2: detach()
z = x.detach()
print(f"detach 후 requires_grad: {z.requires_grad}")  # False

# Resetting gradients (important when training iteratively)
x = torch.tensor([1.0], requires_grad=True)

for i in range(3):
    y = x ** 2
    y.backward()
    print(f"반복 {i}: grad = {x.grad}")
    x.grad.zero_()  # resetting the gradient here is required

간단한 최적화 예시

# Minimise f(x) = (x - 3)^2 with plain gradient descent
x = torch.zeros(1, requires_grad=True)
learning_rate = 0.1

for step in range(20):
    # Forward: evaluate the loss at the current x
    loss = (x - 3).pow(2)

    # Backward: populate x.grad
    loss.backward()

    # Parameter update, excluded from gradient tracking
    with torch.no_grad():
        x.sub_(learning_rate * x.grad)

    # Clear the gradient before the next iteration
    x.grad.zero_()

    if not step % 5:
        print(f"스텝 {step}: x = {x.item():.4f}, loss = {loss.item():.6f}")

print(f"최종 x = {x.item():.4f} (목표: 3.0)")

신경망 구성 요소 (nn 모듈)

PyTorch의 torch.nn 모듈은 신경망 구성에 필요한 모든 빌딩 블록을 제공합니다.

기본 신경망 구성

import torch
import torch.nn as nn

class SimpleNetwork(nn.Module):
    """Feed-forward network: two ReLU-activated hidden layers and a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Consecutive pairs of widths define each Linear layer, in order.
        widths = [input_size, hidden_size, hidden_size, output_size]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.pop()  # the output layer has no activation
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the layer stack applied to ``x``."""
        return self.network(x)

# Build the model and try it out
model = SimpleNetwork(input_size=4, hidden_size=128, output_size=2)
print(model)

# Forward pass on dummy input
x = torch.randn(32, 4)  # batch size 32, input dimension 4
output = model(x)
print(f"출력 형상: {output.shape}")  # (32, 2)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"전체 파라미터: {total_params:,}")
print(f"학습 가능 파라미터: {trainable_params:,}")

주요 레이어

# Linear (fully connected) layer
linear = nn.Linear(in_features=10, out_features=5)

# Convolution layer
conv = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)

# Batch normalization
bn = nn.BatchNorm2d(num_features=32)

# Dropout
dropout = nn.Dropout(p=0.5)

# Activation functions
relu = nn.ReLU()
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(dim=-1)

커스텀 레이어

자신만의 레이어를 정의할 수 있습니다. nn.Module을 상속하고 forward 메서드를 구현하면 됩니다.

class NoisyLinear(nn.Module):
    """Linear layer that adds Gaussian noise to its weights in training mode (for exploration)."""

    def __init__(self, in_features, out_features, noise_std=0.1):
        super().__init__()
        self.noise_std = noise_std
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the layer; weights are perturbed only while ``self.training`` is set."""
        base = self.linear
        if not self.training:
            # Evaluation mode: plain, noise-free linear map.
            return base(x)
        # Fresh noise with the same shape as the weight matrix, scaled by noise_std.
        perturbation = torch.randn_like(base.weight) * self.noise_std
        noisy_weight = base.weight + perturbation
        return x @ noisy_weight.t() + base.bias

class DuelingHead(nn.Module):
    """Dueling-DQN output head: value and advantage streams combined into Q-values."""

    def __init__(self, input_size, n_actions):
        super().__init__()

        def make_stream(n_outputs):
            # Both streams have the same shape: input -> 128 -> ReLU -> n_outputs.
            return nn.Sequential(
                nn.Linear(input_size, 128),
                nn.ReLU(),
                nn.Linear(128, n_outputs),
            )

        self.value_stream = make_stream(1)
        self.advantage_stream = make_stream(n_actions)

    def forward(self, x):
        """Return Q = V + (A - mean(A)), with the mean taken over the last dimension."""
        state_value = self.value_stream(x)
        action_advantage = self.advantage_stream(x)
        mean_advantage = action_advantage.mean(dim=-1, keepdim=True)
        return state_value + action_advantage - mean_advantage

# Usage example
head = DuelingHead(input_size=256, n_actions=4)
features = torch.randn(16, 256)
q_values = head(features)
print(f"Q 값 형상: {q_values.shape}")  # (16, 4)

손실 함수

강화학습에서 자주 사용되는 손실 함수들입니다.

# MSE loss (value-function learning)
mse_loss = nn.MSELoss()
predicted = torch.tensor([2.5, 0.0, -1.0])
target = torch.tensor([3.0, -0.5, -1.0])
loss = mse_loss(predicted, target)
print(f"MSE 손실: {loss.item():.4f}")

# Huber loss (common in DQN; more robust to outliers than MSE)
huber_loss = nn.SmoothL1Loss()
loss = huber_loss(predicted, target)
print(f"Huber 손실: {loss.item():.4f}")

# Cross-entropy loss (policy learning)
ce_loss = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1]])  # logits for 3 actions
target_action = torch.tensor([0])  # index of the correct action
loss = ce_loss(logits, target_action)
print(f"Cross-Entropy 손실: {loss.item():.4f}")

옵티마이저

model = SimpleNetwork(4, 128, 2)

# SGD
optimizer_sgd = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Adam (the most commonly used)
optimizer_adam = torch.optim.Adam(model.parameters(), lr=0.001)

# RMSprop (used in the original DQN paper)
optimizer_rms = torch.optim.RMSprop(model.parameters(), lr=0.00025, alpha=0.95)

# Basic structure of a training loop (only `optimizer` below is stepped)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    # Forward pass on a random batch with random regression targets
    x = torch.randn(32, 4)
    target = torch.randn(32, 2)
    prediction = model(x)

    # Compute the loss
    loss = nn.MSELoss()(prediction, target)

    # Backward pass and parameter update
    optimizer.zero_grad()  # reset gradients
    loss.backward()        # compute gradients
    optimizer.step()       # update parameters

    if epoch % 20 == 0:
        print(f"에폭 {epoch}: 손실 = {loss.item():.4f}")

학습률 스케줄러

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# StepLR: lower the learning rate every fixed number of epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

# ExponentialLR: exponential decay every epoch
# NOTE(review): this rebinds `scheduler`, so only the ExponentialLR instance is
# stepped in the loop below; presumably the two are shown as alternatives.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

# Usage
for epoch in range(100):
    # ... training code ...
    scheduler.step()
    current_lr = optimizer.param_groups[0]['lr']

CNN으로 이미지 처리

Atari 게임 등 이미지 기반 강화학습에서는 합성곱 신경망(CNN)이 핵심입니다.

class AtariCNN(nn.Module):
    """Convolutional Q-network for Atari-style image input (based on the DQN paper).

    Three strided convolutions feed a two-layer fully connected head that
    produces one output per action.

    Args:
        input_channels: Number of channels of the input images.
        n_actions: Number of outputs of the final linear layer.
        input_hw: ``(height, width)`` of the input images, used only to size
            the first fully connected layer. Defaults to ``(84, 84)``.
    """

    def __init__(self, input_channels, n_actions, input_hw=(84, 84)):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        # Size the first FC layer from the conv output for the given input size.
        conv_output_size = self._get_conv_output_size(input_channels, input_hw)

        self.fc = nn.Sequential(
            nn.Linear(conv_output_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_output_size(self, input_channels, input_hw=(84, 84)):
        """Return the flattened feature count the conv stack yields for one image."""
        height, width = input_hw
        # Shape probe only: no_grad avoids building an autograd graph for it.
        with torch.no_grad():
            output = self.conv(torch.zeros(1, input_channels, height, width))
        # Batch size is 1, so the element count is the per-sample feature count.
        return int(output.numel())

    def forward(self, x):
        """Compute per-action outputs for a batch of images with pixel values in 0-255.

        Args:
            x: Tensor of shape (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, n_actions).
        """
        # Scale the 0-255 range down to 0-1
        x = x.float() / 255.0
        conv_out = self.conv(x)
        # flatten() also works when the conv output is not contiguous
        # (e.g. channels_last inputs), where view() would raise.
        flat = conv_out.flatten(start_dim=1)
        return self.fc(flat)

# Smoke test with a random batch of uint8 frames
model = AtariCNN(input_channels=4, n_actions=6)
dummy_input = torch.randint(0, 256, (8, 4, 84, 84), dtype=torch.uint8)
q_values = model(dummy_input)
print(f"Q값 형상: {q_values.shape}")  # (8, 6)

TensorBoard 모니터링

학습 과정을 시각적으로 모니터링하는 것은 매우 중요합니다.

from torch.utils.tensorboard import SummaryWriter
import time

# Create the TensorBoard writer
writer = SummaryWriter(log_dir="runs/rl_experiment")

# Log scalar values (reward, loss, etc.); the values here are synthetic
for step in range(1000):
    fake_loss = 1.0 / (step + 1)
    fake_reward = step * 0.1
    fake_epsilon = max(0.01, 1.0 - step * 0.001)

    writer.add_scalar("training/loss", fake_loss, step)
    writer.add_scalar("training/reward", fake_reward, step)
    writer.add_scalar("training/epsilon", fake_epsilon, step)

# Several values on one chart
writer.add_scalars("comparison", {
    "train_loss": 0.5,
    "val_loss": 0.7,
}, global_step=0)

# Log the model structure
model = SimpleNetwork(4, 128, 2)
dummy_input = torch.randn(1, 4)
writer.add_graph(model, dummy_input)

# Histograms (inspect weight distributions)
for name, param in model.named_parameters():
    writer.add_histogram(name, param.data, global_step=0)

writer.close()

TensorBoard를 실행하려면 터미널에서 다음 명령을 입력합니다.

tensorboard --logdir=runs

GAN으로 Atari 이미지 생성

GAN(Generative Adversarial Network)의 기본 개념을 이해하기 위해, Atari 게임 화면을 생성하는 간단한 GAN을 구현해 봅니다.

import torch
import torch.nn as nn

class Generator(nn.Module):
    """Generator: maps latent noise vectors to images through a fully connected stack."""

    def __init__(self, latent_dim=100, img_channels=1, img_size=64):
        super().__init__()
        self.img_channels = img_channels
        self.img_size = img_size

        # Hidden blocks: Linear -> LeakyReLU(0.2) -> BatchNorm1d, widening 256 -> 512 -> 1024.
        # Input to the first block has shape (batch, latent_dim).
        layers = []
        in_width = latent_dim
        for out_width in (256, 512, 1024):
            layers += [
                nn.Linear(in_width, out_width),
                nn.LeakyReLU(0.2),
                nn.BatchNorm1d(out_width),
            ]
            in_width = out_width
        # Output projection: one value per pixel, passed through Tanh.
        layers += [
            nn.Linear(in_width, img_channels * img_size * img_size),
            nn.Tanh(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return images of shape (batch, img_channels, img_size, img_size) for latent ``z``."""
        flat_pixels = self.model(z)
        return flat_pixels.view(-1, self.img_channels, self.img_size, self.img_size)

class Discriminator(nn.Module):
    """Discriminator: scores flattened images as real or fake with a sigmoid output."""

    def __init__(self, img_channels=1, img_size=64):
        super().__init__()
        # Hidden blocks: Linear -> LeakyReLU(0.2) -> Dropout(0.3), narrowing 512 -> 256.
        layers = []
        in_width = img_channels * img_size * img_size
        for out_width in (512, 256):
            layers += [
                nn.Linear(in_width, out_width),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ]
            in_width = out_width
        # Single sigmoid-activated output per image.
        layers += [nn.Linear(in_width, 1), nn.Sigmoid()]
        self.model = nn.Sequential(*layers)

    def forward(self, img):
        """Flatten each image in the batch and return its score, shape (batch, 1)."""
        batch = img.shape[0]
        return self.model(img.view(batch, -1))

GAN 학습 루프

def train_gan(n_epochs=50, batch_size=64, latent_dim=100):
    """Train a Generator/Discriminator pair on placeholder data and log the losses.

    Each "epoch" is one discriminator update followed by one generator update
    on a freshly sampled batch.

    Args:
        n_epochs: Number of update rounds to run.
        batch_size: Number of real and of fake images per round.
        latent_dim: Size of the noise vector fed to the generator.

    Returns:
        The ``(generator, discriminator)`` pair after training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    generator = Generator(latent_dim=latent_dim).to(device)
    discriminator = Discriminator().to(device)

    optimizer_g = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    criterion = nn.BCELoss()

    # Labels never change, so build them once, directly on the target device.
    real_labels = torch.ones(batch_size, 1, device=device)
    fake_labels = torch.zeros(batch_size, 1, device=device)

    writer = SummaryWriter("runs/gan_atari")
    try:
        for epoch in range(n_epochs):
            # Placeholder "real" data (use a dataset in practice). Sampled
            # uniformly in [-1, 1] so it matches the range of the generator's
            # Tanh output; the previous (randn + 1) / 2 was not bounded.
            real_images = torch.rand(batch_size, 1, 64, 64, device=device) * 2 - 1

            # --- Discriminator update ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)

            d_real = discriminator(real_images)
            # detach(): the discriminator loss must not backpropagate into the generator
            d_fake = discriminator(fake_images.detach())

            d_loss_real = criterion(d_real, real_labels)
            d_loss_fake = criterion(d_fake, fake_labels)
            d_loss = d_loss_real + d_loss_fake

            optimizer_d.zero_grad()
            d_loss.backward()
            optimizer_d.step()

            # --- Generator update ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)
            d_fake = discriminator(fake_images)

            # The generator is scored against the "real" labels
            g_loss = criterion(d_fake, real_labels)

            optimizer_g.zero_grad()
            g_loss.backward()
            optimizer_g.step()

            if epoch % 10 == 0:
                print(f"에폭 {epoch}: D 손실={d_loss.item():.4f}, G 손실={g_loss.item():.4f}")
                writer.add_scalar("GAN/d_loss", d_loss.item(), epoch)
                writer.add_scalar("GAN/g_loss", g_loss.item(), epoch)
    finally:
        # Close the writer even if training raises.
        writer.close()
    return generator, discriminator

강화학습을 위한 PyTorch 팁

모델 저장과 불러오기

# Save the model: only the state_dict is written
torch.save(model.state_dict(), "model.pth")

# Load the model
model = SimpleNetwork(4, 128, 2)
# torch.load unpickles the file: weights_only=True restricts it to tensors and
# plain containers, and map_location="cpu" lets a checkpoint saved on another
# device load on a CPU-only machine.
state_dict = torch.load("model.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # switch to inference mode

배치 처리

# Convert a batch of RL experiences into tensors.
# Each experience is (state, action, reward, next_state, done).
experiences = [
    (np.array([1.0, 2.0, 3.0, 4.0]), 1, 1.0, np.array([1.1, 2.1, 3.1, 4.1]), False),
    (np.array([0.5, 1.5, 2.5, 3.5]), 0, 0.0, np.array([0.6, 1.6, 2.6, 3.6]), True),
]

# Stack the per-step arrays into one ndarray first: building a tensor straight
# from a list of ndarrays is the slow path PyTorch warns about.
states = torch.as_tensor(np.stack([e[0] for e in experiences]), dtype=torch.float32)
actions = torch.tensor([e[1] for e in experiences], dtype=torch.long)
rewards = torch.tensor([e[2] for e in experiences], dtype=torch.float32)
next_states = torch.as_tensor(np.stack([e[3] for e in experiences]), dtype=torch.float32)
dones = torch.tensor([e[4] for e in experiences], dtype=torch.bool)

print(f"상태 배치: {states.shape}")
print(f"행동 배치: {actions.shape}")

타겟 네트워크 복사

# In DQN the target network is refreshed periodically from the online network
online_net = SimpleNetwork(4, 128, 2)
target_net = SimpleNetwork(4, 128, 2)

# Hard update: copy every weight
target_net.load_state_dict(online_net.state_dict())

# Soft update: blend the online weights into the target weights
tau = 0.005
# no_grad() replaces the .data accesses: the in-place copy stays out of the
# autograd graph without bypassing autograd's safety checks.
with torch.no_grad():
    for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
        # target <- tau * online + (1 - tau) * target
        target_param.copy_(tau * online_param + (1 - tau) * target_param)

정리

이번 글에서 다룬 PyTorch 핵심 개념들입니다.

텐서: 다차원 배열로 GPU 연산과 자동 미분 지원
자동 미분: requires_grad=True와 .backward()로 그래디언트 자동 계산
nn.Module: 신경망 구성의 기본 단위, forward() 메서드 구현
손실 함수: MSE, Huber, Cross-Entropy 등 목적에 맞는 손실 선택
옵티마이저: Adam, SGD, RMSprop 등으로 파라미터 업데이트
TensorBoard: 학습 과정의 시각적 모니터링

다음 글에서는 이 PyTorch 기초를 바탕으로 Cross-Entropy 방법을 구현하여 CartPole을 풀어보겠습니다.

[Deep RL] 03. PyTorch Deep Learning Basics: From Tensors to Neural Networks

Introduction to PyTorch

PyTorch is a deep learning framework developed by Facebook (now Meta). It features dynamic computation graphs, an intuitive API, and powerful GPU support, making it one of the most widely used frameworks in reinforcement learning research.

Installation

pip install torch torchvision

Tensors

Tensors are the fundamental data structure of PyTorch, similar to NumPy arrays but with support for GPU computation and automatic differentiation.

Creating Tensors

import torch
import numpy as np

# Several ways to create a tensor
# From a Python list
t1 = torch.tensor([1, 2, 3])
print(f"리스트로부터: {t1}")

# Filled with fixed values or random samples
t_zeros = torch.zeros(3, 4)
t_ones = torch.ones(2, 3)
t_rand = torch.rand(2, 3)  # uniform distribution over 0-1
t_randn = torch.randn(2, 3)  # standard normal distribution

print(f"영 텐서:\n{t_zeros}")
print(f"랜덤 텐서:\n{t_rand}")

# From a NumPy array
np_array = np.array([[1.0, 2.0], [3.0, 4.0]])
t_from_np = torch.from_numpy(np_array)
print(f"NumPy로부터: {t_from_np}")

# Convert the tensor back to NumPy
back_to_np = t_from_np.numpy()
print(f"다시 NumPy로: {back_to_np}")

Tensor Properties

# Inspect the basic attributes of a tensor created with shape (3, 4, 5)
t = torch.randn(3, 4, 5)

print(f"형상 (shape): {t.shape}")
print(f"데이터 타입 (dtype): {t.dtype}")
print(f"장치 (device): {t.device}")
print(f"차원 수 (ndim): {t.ndim}")
print(f"전체 원소 수 (numel): {t.numel()}")

Tensor Operations

a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])

# Basic arithmetic
print(f"덧셈: {a + b}")
print(f"곱셈 (원소별): {a * b}")
print(f"행렬 곱: {a @ b}")
print(f"행렬 곱 (동일): {torch.matmul(a, b)}")

# Reshaping
t = torch.arange(12)
print(f"원본: {t.shape}")

t_reshaped = t.reshape(3, 4)
print(f"reshape(3,4): {t_reshaped.shape}")

t_viewed = t.view(2, 6)
print(f"view(2,6): {t_viewed.shape}")

# Adding / removing dimensions
t = torch.randn(3, 4)
t_unsqueeze = t.unsqueeze(0)  # add a leading batch dimension
print(f"unsqueeze(0): {t_unsqueeze.shape}")  # (1, 3, 4)

t_squeeze = t_unsqueeze.squeeze(0)  # remove the size-1 dimension
print(f"squeeze(0): {t_squeeze.shape}")  # (3, 4)

# Indexing and slicing
t = torch.randn(4, 5)
print(f"첫 번째 행: {t[0]}")
print(f"마지막 열: {t[:, -1]}")
print(f"2x3 부분: {t[:2, :3].shape}")

GPU Computation

Using GPU in PyTorch allows large-scale tensor operations to be processed in parallel.

# Pick the compute device once, in priority order: CUDA, then MPS (Apple Silicon), then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The getattr guard keeps this working on torch builds that have no MPS backend module.
    device = torch.device("mps")
    print("Apple Silicon GPU 사용")
else:
    device = torch.device("cpu")
# Report the device only after the final choice so the message matches what is used below.
print(f"사용 장치: {device}")

# Move an existing tensor to the selected device
t = torch.randn(1000, 1000)
t_gpu = t.to(device)
print(f"텐서 장치: {t_gpu.device}")

# Create tensors directly on the device and compute there
a = torch.randn(1000, 1000, device=device)
b = torch.randn(1000, 1000, device=device)
c = a @ b  # matrix product runs on the selected device

# Move the result back to the CPU
c_cpu = c.cpu()

Gradients and Automatic Differentiation (Autograd)

One of PyTorch's most powerful features is automatic differentiation. It tracks every operation performed on tensors created with requires_grad=True and, when .backward() is called, uses that record to compute the gradients automatically.

Basic Automatic Differentiation

# Create a leaf tensor whose operations are tracked for differentiation
x = torch.tensor([2.0, 3.0]).requires_grad_()

# Forward pass: y = x^2 + 3x
y = x.pow(2) + x * 3
print(f"y = {y}")

# Backward pass: reduce to a scalar first, then dy/dx = 2x + 3
z = torch.sum(y)
z.backward()

print(f"x의 그래디언트: {x.grad}")
# at x=2: dy/dx = 2*2+3 = 7
# at x=3: dy/dx = 2*3+3 = 9
# prints: tensor([7., 9.])

Gradient Computation Control

# Disabling gradient tracking (used at inference time)
x = torch.randn(3, requires_grad=True)

# Option 1: torch.no_grad()
with torch.no_grad():
    y = x * 2
    print(f"requires_grad: {y.requires_grad}")  # False

# Option 2: detach()
z = x.detach()
print(f"detach 후 requires_grad: {z.requires_grad}")  # False

# Resetting gradients (important when training iteratively)
x = torch.tensor([1.0], requires_grad=True)

for i in range(3):
    y = x ** 2
    y.backward()
    print(f"반복 {i}: grad = {x.grad}")
    x.grad.zero_()  # resetting the gradient here is required

Simple Optimization Example

# Minimise f(x) = (x - 3)^2 with plain gradient descent
x = torch.zeros(1, requires_grad=True)
learning_rate = 0.1

for step in range(20):
    # Forward: evaluate the loss at the current x
    loss = (x - 3).pow(2)

    # Backward: populate x.grad
    loss.backward()

    # Parameter update, excluded from gradient tracking
    with torch.no_grad():
        x.sub_(learning_rate * x.grad)

    # Clear the gradient before the next iteration
    x.grad.zero_()

    if not step % 5:
        print(f"스텝 {step}: x = {x.item():.4f}, loss = {loss.item():.6f}")

print(f"최종 x = {x.item():.4f} (목표: 3.0)")

Neural Network Components (nn Module)

PyTorch's torch.nn module provides all the building blocks needed for neural network construction.

Basic Neural Network Construction

import torch
import torch.nn as nn

class SimpleNetwork(nn.Module):
    """Feed-forward network: two ReLU-activated hidden layers and a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Consecutive pairs of widths define each Linear layer, in order.
        widths = [input_size, hidden_size, hidden_size, output_size]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.pop()  # the output layer has no activation
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the layer stack applied to ``x``."""
        return self.network(x)

# Build the model and try it out
model = SimpleNetwork(input_size=4, hidden_size=128, output_size=2)
print(model)

# Forward pass on dummy input
x = torch.randn(32, 4)  # batch size 32, input dimension 4
output = model(x)
print(f"출력 형상: {output.shape}")  # (32, 2)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"전체 파라미터: {total_params:,}")
print(f"학습 가능 파라미터: {trainable_params:,}")

Key Layers

# Linear (fully connected) layer
linear = nn.Linear(in_features=10, out_features=5)

# Convolution layer
conv = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)

# Batch normalization
bn = nn.BatchNorm2d(num_features=32)

# Dropout
dropout = nn.Dropout(p=0.5)

# Activation functions
relu = nn.ReLU()
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(dim=-1)

Custom Layers

You can define your own layers by inheriting from nn.Module and implementing the forward method.

class NoisyLinear(nn.Module):
    """Linear layer that adds Gaussian noise to its weights in training mode (for exploration)."""

    def __init__(self, in_features, out_features, noise_std=0.1):
        super().__init__()
        self.noise_std = noise_std
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the layer; weights are perturbed only while ``self.training`` is set."""
        base = self.linear
        if not self.training:
            # Evaluation mode: plain, noise-free linear map.
            return base(x)
        # Fresh noise with the same shape as the weight matrix, scaled by noise_std.
        perturbation = torch.randn_like(base.weight) * self.noise_std
        noisy_weight = base.weight + perturbation
        return x @ noisy_weight.t() + base.bias

class DuelingHead(nn.Module):
    """Dueling-DQN output head: value and advantage streams combined into Q-values."""

    def __init__(self, input_size, n_actions):
        super().__init__()

        def make_stream(n_outputs):
            # Both streams have the same shape: input -> 128 -> ReLU -> n_outputs.
            return nn.Sequential(
                nn.Linear(input_size, 128),
                nn.ReLU(),
                nn.Linear(128, n_outputs),
            )

        self.value_stream = make_stream(1)
        self.advantage_stream = make_stream(n_actions)

    def forward(self, x):
        """Return Q = V + (A - mean(A)), with the mean taken over the last dimension."""
        state_value = self.value_stream(x)
        action_advantage = self.advantage_stream(x)
        mean_advantage = action_advantage.mean(dim=-1, keepdim=True)
        return state_value + action_advantage - mean_advantage

# Usage example
head = DuelingHead(input_size=256, n_actions=4)
features = torch.randn(16, 256)
q_values = head(features)
print(f"Q 값 형상: {q_values.shape}")  # (16, 4)

Loss Functions

Loss functions commonly used in reinforcement learning:

# MSE loss (value-function learning)
mse_loss = nn.MSELoss()
predicted = torch.tensor([2.5, 0.0, -1.0])
target = torch.tensor([3.0, -0.5, -1.0])
loss = mse_loss(predicted, target)
print(f"MSE 손실: {loss.item():.4f}")

# Huber loss (common in DQN; more robust to outliers than MSE)
huber_loss = nn.SmoothL1Loss()
loss = huber_loss(predicted, target)
print(f"Huber 손실: {loss.item():.4f}")

# Cross-entropy loss (policy learning)
ce_loss = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1]])  # logits for 3 actions
target_action = torch.tensor([0])  # index of the correct action
loss = ce_loss(logits, target_action)
print(f"Cross-Entropy 손실: {loss.item():.4f}")

Optimizers

model = SimpleNetwork(4, 128, 2)

# SGD
optimizer_sgd = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Adam (the most commonly used)
optimizer_adam = torch.optim.Adam(model.parameters(), lr=0.001)

# RMSprop (used in the original DQN paper)
optimizer_rms = torch.optim.RMSprop(model.parameters(), lr=0.00025, alpha=0.95)

# Basic structure of a training loop (only `optimizer` below is stepped)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    # Forward pass on a random batch with random regression targets
    x = torch.randn(32, 4)
    target = torch.randn(32, 2)
    prediction = model(x)

    # Compute the loss
    loss = nn.MSELoss()(prediction, target)

    # Backward pass and parameter update
    optimizer.zero_grad()  # reset gradients
    loss.backward()        # compute gradients
    optimizer.step()       # update parameters

    if epoch % 20 == 0:
        print(f"에폭 {epoch}: 손실 = {loss.item():.4f}")

Learning Rate Schedulers

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# StepLR: lower the learning rate every fixed number of epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

# ExponentialLR: exponential decay every epoch
# NOTE(review): this rebinds `scheduler`, so only the ExponentialLR instance is
# stepped in the loop below; presumably the two are shown as alternatives.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

# Usage
for epoch in range(100):
    # ... training code ...
    scheduler.step()
    current_lr = optimizer.param_groups[0]['lr']

Image Processing with CNN

Convolutional Neural Networks (CNNs) are essential for image-based reinforcement learning such as Atari games.

class AtariCNN(nn.Module):
    """Convolutional Q-network for Atari-style image input (based on the DQN paper).

    Three strided convolutions feed a two-layer fully connected head that
    produces one output per action.

    Args:
        input_channels: Number of channels of the input images.
        n_actions: Number of outputs of the final linear layer.
        input_hw: ``(height, width)`` of the input images, used only to size
            the first fully connected layer. Defaults to ``(84, 84)``.
    """

    def __init__(self, input_channels, n_actions, input_hw=(84, 84)):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        # Size the first FC layer from the conv output for the given input size.
        conv_output_size = self._get_conv_output_size(input_channels, input_hw)

        self.fc = nn.Sequential(
            nn.Linear(conv_output_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_output_size(self, input_channels, input_hw=(84, 84)):
        """Return the flattened feature count the conv stack yields for one image."""
        height, width = input_hw
        # Shape probe only: no_grad avoids building an autograd graph for it.
        with torch.no_grad():
            output = self.conv(torch.zeros(1, input_channels, height, width))
        # Batch size is 1, so the element count is the per-sample feature count.
        return int(output.numel())

    def forward(self, x):
        """Compute per-action outputs for a batch of images with pixel values in 0-255.

        Args:
            x: Tensor of shape (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, n_actions).
        """
        # Scale the 0-255 range down to 0-1
        x = x.float() / 255.0
        conv_out = self.conv(x)
        # flatten() also works when the conv output is not contiguous
        # (e.g. channels_last inputs), where view() would raise.
        flat = conv_out.flatten(start_dim=1)
        return self.fc(flat)

# Smoke test with a random batch of uint8 frames
model = AtariCNN(input_channels=4, n_actions=6)
dummy_input = torch.randint(0, 256, (8, 4, 84, 84), dtype=torch.uint8)
q_values = model(dummy_input)
print(f"Q값 형상: {q_values.shape}")  # (8, 6)

TensorBoard Monitoring

Visually monitoring the training process is very important.

from torch.utils.tensorboard import SummaryWriter
import time

# Create the TensorBoard writer
writer = SummaryWriter(log_dir="runs/rl_experiment")

# Log scalar values (reward, loss, etc.); the values here are synthetic
for step in range(1000):
    fake_loss = 1.0 / (step + 1)
    fake_reward = step * 0.1
    fake_epsilon = max(0.01, 1.0 - step * 0.001)

    writer.add_scalar("training/loss", fake_loss, step)
    writer.add_scalar("training/reward", fake_reward, step)
    writer.add_scalar("training/epsilon", fake_epsilon, step)

# Several values on one chart
writer.add_scalars("comparison", {
    "train_loss": 0.5,
    "val_loss": 0.7,
}, global_step=0)

# Log the model structure
model = SimpleNetwork(4, 128, 2)
dummy_input = torch.randn(1, 4)
writer.add_graph(model, dummy_input)

# Histograms (inspect weight distributions)
for name, param in model.named_parameters():
    writer.add_histogram(name, param.data, global_step=0)

writer.close()

To run TensorBoard, enter the following command in the terminal:

tensorboard --logdir=runs

Generating Atari Images with GAN

To understand the basic concepts of GANs (Generative Adversarial Networks), let us implement a simple GAN that generates Atari game screens.

import torch
import torch.nn as nn

class Generator(nn.Module):
    """Generator: maps latent noise vectors to images through a fully connected stack."""

    def __init__(self, latent_dim=100, img_channels=1, img_size=64):
        super().__init__()
        self.img_channels = img_channels
        self.img_size = img_size

        # Hidden blocks: Linear -> LeakyReLU(0.2) -> BatchNorm1d, widening 256 -> 512 -> 1024.
        # Input to the first block has shape (batch, latent_dim).
        layers = []
        in_width = latent_dim
        for out_width in (256, 512, 1024):
            layers += [
                nn.Linear(in_width, out_width),
                nn.LeakyReLU(0.2),
                nn.BatchNorm1d(out_width),
            ]
            in_width = out_width
        # Output projection: one value per pixel, passed through Tanh.
        layers += [
            nn.Linear(in_width, img_channels * img_size * img_size),
            nn.Tanh(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return images of shape (batch, img_channels, img_size, img_size) for latent ``z``."""
        flat_pixels = self.model(z)
        return flat_pixels.view(-1, self.img_channels, self.img_size, self.img_size)

class Discriminator(nn.Module):
    """Discriminator: scores flattened images as real or fake with a sigmoid output."""

    def __init__(self, img_channels=1, img_size=64):
        super().__init__()
        # Hidden blocks: Linear -> LeakyReLU(0.2) -> Dropout(0.3), narrowing 512 -> 256.
        layers = []
        in_width = img_channels * img_size * img_size
        for out_width in (512, 256):
            layers += [
                nn.Linear(in_width, out_width),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ]
            in_width = out_width
        # Single sigmoid-activated output per image.
        layers += [nn.Linear(in_width, 1), nn.Sigmoid()]
        self.model = nn.Sequential(*layers)

    def forward(self, img):
        """Flatten each image in the batch and return its score, shape (batch, 1)."""
        batch = img.shape[0]
        return self.model(img.view(batch, -1))

GAN Training Loop

def train_gan(n_epochs=50, batch_size=64, latent_dim=100):
    """Train a Generator/Discriminator pair on placeholder data and log the losses.

    Each "epoch" is one discriminator update followed by one generator update
    on a freshly sampled batch.

    Args:
        n_epochs: Number of update rounds to run.
        batch_size: Number of real and of fake images per round.
        latent_dim: Size of the noise vector fed to the generator.

    Returns:
        The ``(generator, discriminator)`` pair after training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    generator = Generator(latent_dim=latent_dim).to(device)
    discriminator = Discriminator().to(device)

    optimizer_g = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    criterion = nn.BCELoss()

    # Labels never change, so build them once, directly on the target device.
    real_labels = torch.ones(batch_size, 1, device=device)
    fake_labels = torch.zeros(batch_size, 1, device=device)

    writer = SummaryWriter("runs/gan_atari")
    try:
        for epoch in range(n_epochs):
            # Placeholder "real" data (use a dataset in practice). Sampled
            # uniformly in [-1, 1] so it matches the range of the generator's
            # Tanh output; the previous (randn + 1) / 2 was not bounded.
            real_images = torch.rand(batch_size, 1, 64, 64, device=device) * 2 - 1

            # --- Discriminator update ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)

            d_real = discriminator(real_images)
            # detach(): the discriminator loss must not backpropagate into the generator
            d_fake = discriminator(fake_images.detach())

            d_loss_real = criterion(d_real, real_labels)
            d_loss_fake = criterion(d_fake, fake_labels)
            d_loss = d_loss_real + d_loss_fake

            optimizer_d.zero_grad()
            d_loss.backward()
            optimizer_d.step()

            # --- Generator update ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)
            d_fake = discriminator(fake_images)

            # The generator is scored against the "real" labels
            g_loss = criterion(d_fake, real_labels)

            optimizer_g.zero_grad()
            g_loss.backward()
            optimizer_g.step()

            if epoch % 10 == 0:
                print(f"에폭 {epoch}: D 손실={d_loss.item():.4f}, G 손실={g_loss.item():.4f}")
                writer.add_scalar("GAN/d_loss", d_loss.item(), epoch)
                writer.add_scalar("GAN/g_loss", g_loss.item(), epoch)
    finally:
        # Close the writer even if training raises.
        writer.close()
    return generator, discriminator

PyTorch Tips for Reinforcement Learning

Saving and Loading Models

# Save the model: only the state_dict is written
torch.save(model.state_dict(), "model.pth")

# Load the model
model = SimpleNetwork(4, 128, 2)
# torch.load unpickles the file: weights_only=True restricts it to tensors and
# plain containers, and map_location="cpu" lets a checkpoint saved on another
# device load on a CPU-only machine.
state_dict = torch.load("model.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # switch to inference mode

Batch Processing

# Convert a batch of RL experiences into tensors.
# Each experience is (state, action, reward, next_state, done).
experiences = [
    (np.array([1.0, 2.0, 3.0, 4.0]), 1, 1.0, np.array([1.1, 2.1, 3.1, 4.1]), False),
    (np.array([0.5, 1.5, 2.5, 3.5]), 0, 0.0, np.array([0.6, 1.6, 2.6, 3.6]), True),
]

# Stack the per-step arrays into one ndarray first: building a tensor straight
# from a list of ndarrays is the slow path PyTorch warns about.
states = torch.as_tensor(np.stack([e[0] for e in experiences]), dtype=torch.float32)
actions = torch.tensor([e[1] for e in experiences], dtype=torch.long)
rewards = torch.tensor([e[2] for e in experiences], dtype=torch.float32)
next_states = torch.as_tensor(np.stack([e[3] for e in experiences]), dtype=torch.float32)
dones = torch.tensor([e[4] for e in experiences], dtype=torch.bool)

print(f"상태 배치: {states.shape}")
print(f"행동 배치: {actions.shape}")

Target Network Copy

# In DQN the target network is refreshed periodically from the online network
online_net = SimpleNetwork(4, 128, 2)
target_net = SimpleNetwork(4, 128, 2)

# Hard update: copy every weight
target_net.load_state_dict(online_net.state_dict())

# Soft update: blend the online weights into the target weights
tau = 0.005
# no_grad() replaces the .data accesses: the in-place copy stays out of the
# autograd graph without bypassing autograd's safety checks.
with torch.no_grad():
    for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
        # target <- tau * online + (1 - tau) * target
        target_param.copy_(tau * online_param + (1 - tau) * target_param)

Summary

Here are the key PyTorch concepts covered in this article:

Tensors: Multidimensional arrays supporting GPU computation and automatic differentiation
Automatic differentiation: Automatic gradient computation with requires_grad=True and .backward()
nn.Module: The basic unit of neural network construction, implementing the forward() method
Loss functions: Choosing the appropriate loss for the objective -- MSE, Huber, Cross-Entropy, etc.
Optimizers: Updating parameters with Adam, SGD, RMSprop, etc.
TensorBoard: Visual monitoring of the training process

In the next article, we will build on these PyTorch basics to implement the Cross-Entropy method and solve CartPole.