[Deep RL] 03. PyTorch Deep Learning Basics: From Tensors to Neural Networks

Introduction to PyTorch

PyTorch is a deep learning framework developed by Facebook (now Meta). It features dynamic computation graphs, an intuitive API, and powerful GPU support, making it one of the most widely used frameworks in reinforcement learning research.

Installation

pip install torch torchvision

Tensors

Tensors are the fundamental data structure of PyTorch, similar to NumPy arrays but with support for GPU computation and automatic differentiation.

Creating Tensors

import torch
import numpy as np

# Create tensors in several different ways
# From a Python list
t1 = torch.tensor([1, 2, 3])
print(f"리스트로부터: {t1}")

# Initialised with specific values
t_zeros = torch.zeros(3, 4)
t_ones = torch.ones(2, 3)
t_rand = torch.rand(2, 3)  # uniform distribution over 0~1
t_randn = torch.randn(2, 3)  # standard normal distribution

print(f"영 텐서:\n{t_zeros}")
print(f"랜덤 텐서:\n{t_rand}")

# From a NumPy array
np_array = np.array([[1.0, 2.0], [3.0, 4.0]])
t_from_np = torch.from_numpy(np_array)
print(f"NumPy로부터: {t_from_np}")

# Convert the tensor back to NumPy
back_to_np = t_from_np.numpy()
print(f"다시 NumPy로: {back_to_np}")

Tensor Properties

# Random 3 x 4 x 5 tensor used to show the basic tensor attributes
t = torch.randn(3, 4, 5)

# Each line prints one attribute: shape, dtype, device, ndim and numel()
print(f"형상 (shape): {t.shape}")
print(f"데이터 타입 (dtype): {t.dtype}")
print(f"장치 (device): {t.device}")
print(f"차원 수 (ndim): {t.ndim}")
print(f"전체 원소 수 (numel): {t.numel()}")

Tensor Operations

# Two 2 x 2 operands for the arithmetic examples
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])

# Basic arithmetic
print(f"덧셈: {a + b}")
print(f"곱셈 (원소별): {a * b}")
print(f"행렬 곱: {a @ b}")
print(f"행렬 곱 (동일): {torch.matmul(a, b)}")

# Reshaping
t = torch.arange(12)
print(f"원본: {t.shape}")

t_reshaped = t.reshape(3, 4)
print(f"reshape(3,4): {t_reshaped.shape}")

t_viewed = t.view(2, 6)
print(f"view(2,6): {t_viewed.shape}")

# Adding / removing dimensions
t = torch.randn(3, 4)
t_unsqueeze = t.unsqueeze(0)  # add a batch dimension
print(f"unsqueeze(0): {t_unsqueeze.shape}")  # (1, 3, 4)

t_squeeze = t_unsqueeze.squeeze(0)  # remove the size-1 dimension
print(f"squeeze(0): {t_squeeze.shape}")  # (3, 4)

# Indexing and slicing
t = torch.randn(4, 5)
print(f"첫 번째 행: {t[0]}")
print(f"마지막 열: {t[:, -1]}")
print(f"2x3 부분: {t[:2, :3].shape}")

GPU Computation

Using a GPU in PyTorch allows large-scale tensor operations to be processed in parallel.

# Pick the compute device once: CUDA first, then Apple Silicon (MPS), then CPU.
# The MPS backend is looked up with getattr so that PyTorch builds which do
# not ship `torch.backends.mps` fall through to the CPU instead of raising.
mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
    print("Apple Silicon GPU 사용")
else:
    device = torch.device("cpu")
# Report the device only after the selection is final
print(f"사용 장치: {device}")

# Move an existing CPU tensor to the selected device
t = torch.randn(1000, 1000)
t_gpu = t.to(device)
print(f"텐서 장치: {t_gpu.device}")

# Allocate the operands directly on the device and multiply them there
a = torch.randn(1000, 1000, device=device)
b = torch.randn(1000, 1000, device=device)
c = a @ b  # matrix product runs on `device`

# Move the result back to the CPU
c_cpu = c.cpu()

Gradients and Automatic Differentiation (Autograd)

One of PyTorch's most powerful features is automatic differentiation. It tracks all operations on tensors with requires_grad=True and automatically computes gradients.

Basic Automatic Differentiation

# Leaf tensor with gradient tracking switched on
x = torch.tensor([2.0, 3.0], requires_grad=True)

# Forward pass, element-wise: y = x^2 + 3x
y = x.pow(2) + x.mul(3)
print(f"y = {y}")

# Backward pass: reduce y to a scalar, then back-propagate.
# Analytically dy/dx = 2x + 3.
z = torch.sum(y)
z.backward()

print(f"x의 그래디언트: {x.grad}")
# at x=2: dy/dx = 2*2+3 = 7
# at x=3: dy/dx = 2*3+3 = 9
# output: tensor([7., 9.])

Gradient Computation Control

# Disabling gradient tracking (used at inference time)
x = torch.randn(3, requires_grad=True)

# Option 1: torch.no_grad()
with torch.no_grad():
    y = x * 2
    print(f"requires_grad: {y.requires_grad}")  # False

# Option 2: detach()
z = x.detach()
print(f"detach 후 requires_grad: {z.requires_grad}")  # False

# Resetting gradients (important in iterative training)
x = torch.tensor([1.0], requires_grad=True)

for i in range(3):
    y = x ** 2
    y.backward()
    print(f"반복 {i}: grad = {x.grad}")
    x.grad.zero_()  # the gradient must be reset on every iteration

Simple Optimization Example

# Minimise f(x) = (x - 3)^2 with hand-written gradient descent
x = torch.tensor([0.0], requires_grad=True)
learning_rate = 0.1

for step in range(20):
    # Forward pass: evaluate the loss at the current x
    loss = (x - 3).pow(2)

    # Backward pass: fill x.grad with d(loss)/dx
    loss.backward()

    # Parameter update, performed without gradient tracking
    with torch.no_grad():
        x.sub_(learning_rate * x.grad)

    # Reset the gradient before the next iteration
    x.grad.zero_()

    if not step % 5:
        print(f"스텝 {step}: x = {x.item():.4f}, loss = {loss.item():.6f}")

print(f"최종 x = {x.item():.4f} (목표: 3.0)")

Neural Network Components (nn Module)

PyTorch's torch.nn module provides all the building blocks needed for neural network construction.

Basic Neural Network Construction

import torch
import torch.nn as nn

class SimpleNetwork(nn.Module):
    """Simple feed-forward network: two ReLU hidden layers and a linear output."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are listed in forward order; nn.Sequential indexes them 0..4.
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the stacked layers and return the result."""
        output = self.network(x)
        return output

# Build the model and smoke-test it
model = SimpleNetwork(input_size=4, hidden_size=128, output_size=2)
print(model)

# Forward pass on dummy input
x = torch.randn(32, 4)  # batch size 32, input dimension 4
output = model(x)
print(f"출력 형상: {output.shape}")  # (32, 2)

# Count the parameters in a single pass over the model
total_params = 0
trainable_params = 0
for p in model.parameters():
    total_params += p.numel()
    if p.requires_grad:
        trainable_params += p.numel()
print(f"전체 파라미터: {total_params:,}")
print(f"학습 가능 파라미터: {trainable_params:,}")

Key Layers

# Linear (fully connected) layer
linear = nn.Linear(in_features=10, out_features=5)

# Convolution layer
conv = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)

# Batch normalisation
bn = nn.BatchNorm2d(num_features=32)

# Dropout
dropout = nn.Dropout(p=0.5)

# Activation functions
relu = nn.ReLU()
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(dim=-1)

Custom Layers

You can define your own layers by inheriting from nn.Module and implementing the forward method.

class NoisyLinear(nn.Module):
    """Linear layer with noise added to its weights (intended to aid exploration)."""

    def __init__(self, in_features, out_features, noise_std=0.1):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.noise_std = noise_std

    def forward(self, x):
        # Outside training mode the wrapped layer is applied unchanged.
        if not self.training:
            return self.linear(x)

        # Training mode: draw fresh Gaussian noise shaped like the weight
        # matrix, scale it by noise_std and add it before the projection.
        base_weight = self.linear.weight
        noisy_weight = base_weight + torch.randn_like(base_weight) * self.noise_std
        return x @ noisy_weight.t() + self.linear.bias

class DuelingHead(nn.Module):
    """Output head with the Dueling DQN structure (value and advantage streams)."""

    def __init__(self, input_size, n_actions):
        super().__init__()
        # Value stream: maps the features to a single output per sample.
        value_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.value_stream = nn.Sequential(*value_layers)
        # Advantage stream: maps the features to one output per action.
        advantage_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
        ]
        self.advantage_stream = nn.Sequential(*advantage_layers)

    def forward(self, x):
        """Combine both streams as Q = V + (A - mean(A))."""
        state_value = self.value_stream(x)
        advantages = self.advantage_stream(x)
        # Mean over the last (action) dimension, kept so it broadcasts back.
        mean_advantage = advantages.mean(dim=-1, keepdim=True)
        return state_value + advantages - mean_advantage

# Usage example
head = DuelingHead(input_size=256, n_actions=4)
features = torch.randn(16, 256)
q_values = head(features)
print(f"Q 값 형상: {q_values.shape}")  # (16, 4)

Loss Functions

Loss functions commonly used in reinforcement learning:

# Predictions and targets shared by the two regression losses
predicted = torch.tensor([2.5, 0.0, -1.0])
target = torch.tensor([3.0, -0.5, -1.0])

# MSE loss (value-function learning)
mse_loss = nn.MSELoss()
loss = mse_loss(input=predicted, target=target)
print(f"MSE 손실: {loss.item():.4f}")

# Huber loss (widely used in DQN, more robust to outliers than MSE)
huber_loss = nn.SmoothL1Loss()
loss = huber_loss(input=predicted, target=target)
print(f"Huber 손실: {loss.item():.4f}")

# Cross-entropy loss (policy learning)
logits = torch.tensor([[2.0, 1.0, 0.1]])  # logits for 3 actions
target_action = torch.tensor([0])  # index of the correct action
ce_loss = nn.CrossEntropyLoss()
loss = ce_loss(input=logits, target=target_action)
print(f"Cross-Entropy 손실: {loss.item():.4f}")

Optimizers

# Model whose parameters the optimizers below are built over
model = SimpleNetwork(4, 128, 2)

# SGD
optimizer_sgd = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Adam (the most widely used)
optimizer_adam = torch.optim.Adam(model.parameters(), lr=0.001)

# RMSprop (used in the original DQN paper)
optimizer_rms = torch.optim.RMSprop(model.parameters(), lr=0.00025, alpha=0.95)

# Basic structure of a training loop
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The loss module is loop-invariant, so it is built once instead of being
# re-instantiated on every iteration.
criterion = nn.MSELoss()

for epoch in range(100):
    # Forward pass on a random batch (32 samples, 4 inputs -> 2 outputs)
    x = torch.randn(32, 4)
    target = torch.randn(32, 2)
    prediction = model(x)

    # Compute the loss
    loss = criterion(prediction, target)

    # Backward pass and parameter update
    optimizer.zero_grad()  # reset gradients
    loss.backward()        # compute gradients
    optimizer.step()       # update parameters

    if epoch % 20 == 0:
        print(f"에폭 {epoch}: 손실 = {loss.item():.4f}")

Learning Rate Schedulers

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# StepLR: lower the learning rate every fixed number of epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

# ExponentialLR: exponential decay on every epoch
# NOTE: this rebinds `scheduler`, so only the ExponentialLR instance is the
# one stepped in the loop below; the StepLR object above is no longer referenced.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

# How to use it
for epoch in range(100):
    # ... training code ...
    # NOTE(review): the training step is elided here. PyTorch's scheduler
    # docs expect optimizer.step() to run before scheduler.step() -- confirm
    # the order when filling in the training code.
    scheduler.step()
    current_lr = optimizer.param_groups[0]['lr']

Image Processing with CNN

Convolutional Neural Networks (CNNs) are essential for image-based reinforcement learning such as Atari games.

class AtariCNN(nn.Module):
    """CNN for Atari games (layer layout based on the DQN paper).

    Args:
        input_channels: Number of channels of the input image tensor.
        n_actions: Number of outputs of the final linear layer.
        input_size: ``(height, width)`` of the images the network will be fed.
            Defaults to ``(84, 84)``; it fixes the width of the first
            fully connected layer.
    """

    def __init__(self, input_channels, n_actions, input_size=(84, 84)):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        # Size of the flattened conv output for the configured input size
        conv_output_size = self._get_conv_output_size(input_channels, input_size)

        self.fc = nn.Sequential(
            nn.Linear(conv_output_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_output_size(self, input_channels, input_size=(84, 84)):
        """Return the flattened feature count the conv stack yields per sample.

        A single all-zero image of shape ``(1, input_channels, *input_size)``
        is pushed through ``self.conv`` and the resulting elements are counted.
        """
        height, width = input_size
        dummy = torch.zeros(1, input_channels, height, width)
        # Shape probe only: skip building an autograd graph for it.
        with torch.no_grad():
            output = self.conv(dummy)
        return int(output.view(1, -1).shape[1])

    def forward(self, x):
        """Compute the network output for a batch of images.

        ``x`` is laid out as ``(batch, channels, height, width)``; it is cast
        to float and divided by 255 to map the 0-255 range onto 0-1.
        """
        x = x.float() / 255.0
        conv_out = self.conv(x)
        # Flatten everything except the batch dimension for the linear layers
        flat = conv_out.view(conv_out.size(0), -1)
        return self.fc(flat)

# Test
model = AtariCNN(input_channels=4, n_actions=6)
dummy_input = torch.randint(0, 256, (8, 4, 84, 84), dtype=torch.uint8)
q_values = model(dummy_input)
print(f"Q값 형상: {q_values.shape}")  # (8, 6)

TensorBoard Monitoring

Visually monitoring the training process is very important.

from torch.utils.tensorboard import SummaryWriter
import time

# Create the TensorBoard writer. The with-block closes it on exit, so the
# event file is released even if one of the logging calls raises.
with SummaryWriter(log_dir="runs/rl_experiment") as writer:
    # Log scalar values (reward, loss, ...)
    for step in range(1000):
        fake_loss = 1.0 / (step + 1)
        fake_reward = step * 0.1
        fake_epsilon = max(0.01, 1.0 - step * 0.001)

        writer.add_scalar("training/loss", fake_loss, step)
        writer.add_scalar("training/reward", fake_reward, step)
        writer.add_scalar("training/epsilon", fake_epsilon, step)

    # Several values on a single chart
    writer.add_scalars("comparison", {
        "train_loss": 0.5,
        "val_loss": 0.7,
    }, global_step=0)

    # Record the model structure
    model = SimpleNetwork(4, 128, 2)
    dummy_input = torch.randn(1, 4)
    writer.add_graph(model, dummy_input)

    # Histograms (inspect the weight distributions); detach() hands the
    # writer the values without tracking gradients.
    for name, param in model.named_parameters():
        writer.add_histogram(name, param.detach(), global_step=0)

To run TensorBoard, enter the following command in the terminal:

tensorboard --logdir=runs

Generating Atari Images with GAN

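To understand the basic concepts of GANs (Generative Adversarial Networks), let us implement a simple GAN for Atari-style game screens. To keep the example self-contained, the training loop below feeds it random placeholder tensors where a real dataset of game frames would normally be used.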

import torch
import torch.nn as nn

class Generator(nn.Module):
    """Generator: produces images from random noise."""

    def __init__(self, latent_dim=100, img_channels=1, img_size=64):
        super().__init__()
        self.img_size = img_size
        self.img_channels = img_channels

        # Hidden stack. Input is (batch, latent_dim); each stage is
        # Linear -> LeakyReLU(0.2) -> BatchNorm1d at the next width.
        widths = [latent_dim, 256, 512, 1024]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.LeakyReLU(0.2))
            layers.append(nn.BatchNorm1d(fan_out))

        # Output stage: one value per pixel, passed through Tanh.
        layers.append(nn.Linear(1024, img_channels * img_size * img_size))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Map noise ``z`` to a batch shaped (-1, img_channels, img_size, img_size)."""
        flat_img = self.model(z)
        image_shape = (-1, self.img_channels, self.img_size, self.img_size)
        return flat_img.view(*image_shape)

class Discriminator(nn.Module):
    """Discriminator: tells real images from fake ones."""

    def __init__(self, img_channels=1, img_size=64):
        super().__init__()
        n_pixels = img_channels * img_size * img_size
        # Two LeakyReLU + Dropout hidden stages, then a single Sigmoid output.
        layers = [
            nn.Linear(n_pixels, 512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, img):
        """Flatten each image to a vector and return the model's output for it."""
        batch = img.size(0)
        return self.model(img.view(batch, -1))

GAN Training Loop

def train_gan(n_epochs=50, batch_size=64, latent_dim=100):
    """Train the GAN on placeholder data and log both losses to TensorBoard.

    Args:
        n_epochs: Number of loop iterations; each one processes a single batch.
        batch_size: Number of real and of generated samples per iteration.
        latent_dim: Length of the noise vector fed to the generator.

    Returns:
        The ``(generator, discriminator)`` pair after training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    generator = Generator(latent_dim=latent_dim).to(device)
    discriminator = Discriminator().to(device)

    optimizer_g = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    criterion = nn.BCELoss()

    # Labels are identical on every iteration, so allocate them once,
    # directly on the target device.
    real_labels = torch.ones(batch_size, 1, device=device)
    fake_labels = torch.zeros(batch_size, 1, device=device)

    writer = SummaryWriter("runs/gan_atari")
    try:
        for epoch in range(n_epochs):
            # Placeholder "real" batch (a real dataset would be used in
            # practice). Uniform values in [-1, 1) so the real data covers
            # the same range as the generator's Tanh output.
            real_images = torch.rand(batch_size, 1, 64, 64, device=device) * 2 - 1

            # --- Discriminator step ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)

            d_real = discriminator(real_images)
            # detach() keeps this step from back-propagating into the generator
            d_fake = discriminator(fake_images.detach())

            d_loss_real = criterion(d_real, real_labels)
            d_loss_fake = criterion(d_fake, fake_labels)
            d_loss = d_loss_real + d_loss_fake

            optimizer_d.zero_grad()
            d_loss.backward()
            optimizer_d.step()

            # --- Generator step ---
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_images = generator(z)
            d_fake = discriminator(fake_images)

            # The generator is scored against the "real" labels: its loss
            # falls as the discriminator rates the fakes closer to 1.
            g_loss = criterion(d_fake, real_labels)

            optimizer_g.zero_grad()
            g_loss.backward()
            optimizer_g.step()

            if epoch % 10 == 0:
                print(f"에폭 {epoch}: D 손실={d_loss.item():.4f}, G 손실={g_loss.item():.4f}")
                writer.add_scalar("GAN/d_loss", d_loss.item(), epoch)
                writer.add_scalar("GAN/g_loss", g_loss.item(), epoch)
    finally:
        # Close the writer even when training is interrupted by an exception
        writer.close()

    return generator, discriminator

PyTorch Tips for Reinforcement Learning

Saving and Loading Models

# Save the model (only its state_dict is written)
torch.save(model.state_dict(), "model.pth")

# Load the model
model = SimpleNetwork(4, 128, 2)
# map_location="cpu" lets a checkpoint whose tensors were saved from another
# device load on a machine without that device. weights_only=True restricts
# unpickling to tensor data, so a tampered file cannot run arbitrary code.
state_dict = torch.load("model.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # switch to inference mode

Batch Processing

# Convert a batch of reinforcement-learning experiences into tensors.
# Each experience is a (state, action, reward, next_state, done) tuple.
experiences = [
    (np.array([1.0, 2.0, 3.0, 4.0]), 1, 1.0, np.array([1.1, 2.1, 3.1, 4.1]), False),
    (np.array([0.5, 1.5, 2.5, 3.5]), 0, 0.0, np.array([0.6, 1.6, 2.6, 3.6]), True),
]

# The per-sample state arrays are first combined into one ndarray with
# np.array: handing torch.tensor() a Python list of ndarrays is the slow
# conversion path that PyTorch warns about.
states = torch.tensor(np.array([e[0] for e in experiences]), dtype=torch.float32)
actions = torch.tensor([e[1] for e in experiences], dtype=torch.long)
rewards = torch.tensor([e[2] for e in experiences], dtype=torch.float32)
next_states = torch.tensor(np.array([e[3] for e in experiences]), dtype=torch.float32)
dones = torch.tensor([e[4] for e in experiences], dtype=torch.bool)

print(f"상태 배치: {states.shape}")
print(f"행동 배치: {actions.shape}")

Target Network Copy

# In DQN the target network is updated periodically from the online network
online_net = SimpleNetwork(4, 128, 2)
target_net = SimpleNetwork(4, 128, 2)

# Hard update: copy all weights at once
target_net.load_state_dict(online_net.state_dict())

# Soft update: blend the weights, target <- tau * online + (1 - tau) * target.
# The in-place copy runs under torch.no_grad() rather than through `.data`,
# so autograd does not record it but still sees that the parameter changed.
tau = 0.005
with torch.no_grad():
    for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
        target_param.copy_(tau * online_param + (1 - tau) * target_param)

Summary

Here are the key PyTorch concepts covered in this article:

Tensors: Multidimensional arrays supporting GPU computation and automatic differentiation
Automatic differentiation: Automatic gradient computation with requires_grad=True and .backward()
nn.Module: The basic unit of neural network construction, implementing the forward() method
Loss functions: Choosing the appropriate loss for the objective -- MSE, Huber, Cross-Entropy, etc.
Optimizers: Updating parameters with Adam, SGD, RMSprop, etc.
TensorBoard: Visual monitoring of the training process

In the next article, we will build on these PyTorch basics to implement the Cross-Entropy method and solve CartPole.