Chaos and Order

💡 왼쪽 원문을 읽으면서 오른쪽에 따라 써보세요. Tab 키로 힌트를 받을 수 있습니다.

개요

지금까지 다룬 강화학습 알고리즘들은 모두 그래디언트 기반이었다. 정책 네트워크의 파라미터를 역전파(backpropagation)로 업데이트했다. 하지만 보상 함수가 미분 불가능하거나, 환경이 블랙박스인 경우에는 어떻게 할까?

Black-Box 최적화는 목적 함수의 내부 구조를 모르는 채로, 입력-출력 관계만으로 최적화하는 방법이다. 진화 전략(Evolution Strategies, ES)과 유전 알고리즘(Genetic Algorithms, GA)이 대표적이다.

Black-Box 방법의 특징

그래디언트 기반 vs Black-Box

항목	그래디언트 기반	Black-Box
미분 필요성	목적 함수가 미분 가능해야 함	불필요
정보 활용	그래디언트 (방향 + 크기)	함수값만 사용
확장성	수백만 파라미터 가능	파라미터 수에 민감
병렬화	제한적 (동기화 필요)	자연스러운 병렬화
탐색 성격	로컬 탐색	글로벌 탐색 가능

장점

보상 함수의 미분 불가능해도 적용 가능
환경의 내부 구조를 알 필요 없음
대규모 병렬화가 자연스러움
지역 최적에 빠지기 어려움

단점

샘플 효율성이 낮음
고차원 파라미터 공간에서 느림
그래디언트 정보를 활용하지 못함

진화 전략 (Evolution Strategies)

기본 원리

ES는 파라미터 공간에서 무작위 변이(perturbation) 를 만들고, 좋은 성능을 보인 변이 방향으로 파라미터를 업데이트한다. 놀랍게도 이 과정이 그래디언트 추정과 수학적으로 동치라는 것이 밝혀졌다.

ES의 수학적 직관

파라미터 theta에 가우시안 노이즈 epsilon을 더한 theta + sigma * epsilon에서의 기대 보상을 최대화한다. 이때 보상의 기대값에 대한 theta의 그래디언트를 몬테카를로 추정하면, 결국 보상으로 가중된 노이즈 방향의 평균이 된다.

CartPole에서의 ES 구현

import torch
import torch.nn as nn
import numpy as np
import gymnasium as gym

class ESPolicy(nn.Module):
    """ES용 정책 네트워크"""
    def __init__(self, obs_size, act_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, 64),
            nn.Tanh(),
            nn.Linear(64, act_size),
        )

    def forward(self, obs):
        return self.net(obs)

    def get_action(self, obs):
        with torch.no_grad():
            logits = self.forward(torch.FloatTensor(obs))
            return logits.argmax().item()

def evaluate_policy(env, policy, num_episodes=5):
    """정책의 평균 보상 계산"""
    total_reward = 0.0
    for _ in range(num_episodes):
        obs, _ = env.reset()
        done = False
        while not done:
            action = policy.get_action(obs)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
    return total_reward / num_episodes

def train_es_cartpole(pop_size=50, num_generations=100,
                      sigma=0.1, lr=0.01):
    """CartPole에서 ES 학습"""
    env = gym.make('CartPole-v1')
    obs_size = env.observation_space.shape[0]
    act_size = env.action_space.n

    # 기본 정책
    policy = ESPolicy(obs_size, act_size)
    base_params = nn.utils.parameters_to_vector(
        policy.parameters()
    ).detach()

    for gen in range(num_generations):
        # 1. 무작위 변이 생성
        noise = torch.randn(pop_size, len(base_params))

        # 2. 각 변이에 대해 보상 평가
        rewards = []
        for i in range(pop_size):
            # 양방향 변이 (미러링)
            for sign in [1, -1]:
                perturbed = base_params + sign * sigma * noise[i]
                nn.utils.vector_to_parameters(perturbed, policy.parameters())
                reward = evaluate_policy(env, policy, num_episodes=3)
                rewards.append((reward, sign, i))

        # 3. 보상으로 가중된 업데이트 방향 계산
        rewards_array = np.array([r[0] for r in rewards])

        # 보상 정규화 (rank-based)
        ranked = np.argsort(np.argsort(-rewards_array)).astype(np.float32)
        ranked = (ranked - ranked.mean()) / (ranked.std() + 1e-8)

        # 그래디언트 추정
        grad = torch.zeros_like(base_params)
        for idx, (_, sign, noise_idx) in enumerate(rewards):
            grad += ranked[idx] * sign * noise[noise_idx]
        grad /= (2 * pop_size * sigma)

        # 4. 파라미터 업데이트
        base_params += lr * grad

        # 성능 확인
        nn.utils.vector_to_parameters(base_params, policy.parameters())
        avg_reward = evaluate_policy(env, policy, num_episodes=10)
        if gen % 10 == 0:
            print(f"세대 {gen}: 평균 보상 = {avg_reward:.1f}")

        if avg_reward >= 495:
            print(f"세대 {gen}에서 해결!")
            break

    return policy

HalfCheetah에서의 ES

연속 행동 공간에서의 ES는 더 많은 인구(population)와 세대(generation)가 필요하다:

class ESContinuousPolicy(nn.Module):
    def __init__(self, obs_size, act_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, 256),
            nn.Tanh(),
            nn.Linear(256, 256),
            nn.Tanh(),
            nn.Linear(256, act_size),
            nn.Tanh(),
        )

    def forward(self, obs):
        return self.net(obs)

    def get_action(self, obs):
        with torch.no_grad():
            return self.forward(torch.FloatTensor(obs)).numpy()

def train_es_continuous(env_name='HalfCheetah-v4', pop_size=200,
                        num_generations=500, sigma=0.02, lr=0.01):
    """연속 환경에서 ES 학습 (병렬화 가능)"""
    env = gym.make(env_name)
    obs_size = env.observation_space.shape[0]
    act_size = env.action_space.shape[0]

    policy = ESContinuousPolicy(obs_size, act_size)
    base_params = nn.utils.parameters_to_vector(
        policy.parameters()
    ).detach()

    # Adam 모멘텀 추적
    m = torch.zeros_like(base_params)  # 1차 모멘트
    v = torch.zeros_like(base_params)  # 2차 모멘트
    beta1, beta2 = 0.9, 0.999

    for gen in range(num_generations):
        noise = torch.randn(pop_size, len(base_params))
        rewards_pos = np.zeros(pop_size)
        rewards_neg = np.zeros(pop_size)

        # 병렬화 가능한 평가 루프
        for i in range(pop_size):
            # 양방향 변이 평가
            params_pos = base_params + sigma * noise[i]
            nn.utils.vector_to_parameters(params_pos, policy.parameters())
            rewards_pos[i] = evaluate_policy(env, policy, num_episodes=1)

            params_neg = base_params - sigma * noise[i]
            nn.utils.vector_to_parameters(params_neg, policy.parameters())
            rewards_neg[i] = evaluate_policy(env, policy, num_episodes=1)

        # 보상 차이로 그래디언트 추정
        all_rewards = np.concatenate([rewards_pos, rewards_neg])
        reward_mean = all_rewards.mean()
        reward_std = all_rewards.std() + 1e-8

        grad = torch.zeros_like(base_params)
        for i in range(pop_size):
            normalized = (rewards_pos[i] - rewards_neg[i]) / reward_std
            grad += normalized * noise[i]
        grad /= (2 * pop_size * sigma)

        # Adam 업데이트
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad ** 2
        m_hat = m / (1 - beta1 ** (gen + 1))
        v_hat = v / (1 - beta2 ** (gen + 1))

        base_params += lr * m_hat / (torch.sqrt(v_hat) + 1e-8)

        nn.utils.vector_to_parameters(base_params, policy.parameters())
        if gen % 10 == 0:
            avg = evaluate_policy(env, policy, num_episodes=5)
            print(f"세대 {gen}: 평균 보상 = {avg:.1f}")

    return policy

유전 알고리즘 (Genetic Algorithm)

GA 기본 원리

유전 알고리즘은 생물학적 진화를 모방한다:

인구(Population): 여러 개의 정책(개체)을 유지
적합도 평가(Fitness): 각 개체의 보상을 측정
선택(Selection): 적합도가 높은 개체를 선택
교차(Crossover): 선택된 개체들의 파라미터를 혼합
변이(Mutation): 소량의 무작위 변화 추가

CartPole에서의 GA 구현

import copy

def train_ga_cartpole(pop_size=100, num_generations=50,
                      elite_ratio=0.1, mutation_rate=0.1,
                      mutation_sigma=0.1):
    """유전 알고리즘으로 CartPole 학습"""
    env = gym.make('CartPole-v1')
    obs_size = env.observation_space.shape[0]
    act_size = env.action_space.n

    # 초기 인구 생성
    population = [ESPolicy(obs_size, act_size) for _ in range(pop_size)]
    num_elite = max(int(pop_size * elite_ratio), 1)

    for gen in range(num_generations):
        # 1. 적합도 평가
        fitness = []
        for individual in population:
            reward = evaluate_policy(env, individual, num_episodes=5)
            fitness.append(reward)

        # 정렬 (적합도 내림차순)
        sorted_indices = np.argsort(fitness)[::-1]
        best_fitness = fitness[sorted_indices[0]]

        if gen % 5 == 0:
            print(f"세대 {gen}: 최고 적합도 = {best_fitness:.1f}, "
                  f"평균 = {np.mean(fitness):.1f}")

        if best_fitness >= 495:
            print(f"세대 {gen}에서 해결!")
            return population[sorted_indices[0]]

        # 2. 엘리트 선택
        elites = [copy.deepcopy(population[i]) for i in sorted_indices[:num_elite]]

        # 3. 다음 세대 생성
        new_population = list(elites)  # 엘리트는 그대로 유지

        while len(new_population) < pop_size:
            # 토너먼트 선택
            parent1 = tournament_select(population, fitness)
            parent2 = tournament_select(population, fitness)

            # 교차
            child = crossover(parent1, parent2)

            # 변이
            mutate(child, mutation_rate, mutation_sigma)

            new_population.append(child)

        population = new_population

    return population[sorted_indices[0]]

def tournament_select(population, fitness, k=3):
    """토너먼트 선택: k개 중 최고 적합도 선택"""
    indices = np.random.choice(len(population), k, replace=False)
    best_idx = indices[np.argmax([fitness[i] for i in indices])]
    return copy.deepcopy(population[best_idx])

def crossover(parent1, parent2):
    """균일 교차: 각 파라미터를 50% 확률로 선택"""
    child = copy.deepcopy(parent1)
    for c_param, p2_param in zip(child.parameters(), parent2.parameters()):
        mask = torch.rand_like(c_param) > 0.5
        c_param.data[mask] = p2_param.data[mask]
    return child

def mutate(individual, rate=0.1, sigma=0.1):
    """가우시안 변이"""
    for param in individual.parameters():
        mask = torch.rand_like(param) < rate
        noise = torch.randn_like(param) * sigma
        param.data += mask.float() * noise

GA 개선 기법

Deep GA

Uber AI Labs의 Deep GA는 대규모 신경망에 GA를 적용하기 위한 기법이다:

class DeepGA:
    """Deep GA: 파라미터 인코딩으로 메모리 효율적 GA"""

    def __init__(self, policy_fn, pop_size=1000, seed_size=5):
        self.pop_size = pop_size
        self.policy_fn = policy_fn

        # 각 개체를 시드 시퀀스로 인코딩 (메모리 절약)
        self.population = []
        for _ in range(pop_size):
            seeds = [np.random.randint(0, 2**31) for _ in range(seed_size)]
            self.population.append(seeds)

    def decode(self, seed_sequence):
        """시드 시퀀스에서 정책 파라미터 복원"""
        policy = self.policy_fn()
        base_params = nn.utils.parameters_to_vector(
            policy.parameters()
        ).detach()

        params = torch.zeros_like(base_params)
        for seed in seed_sequence:
            rng = np.random.RandomState(seed)
            noise = torch.FloatTensor(
                rng.randn(len(base_params))
            ) * 0.01
            params += noise

        nn.utils.vector_to_parameters(params, policy.parameters())
        return policy

    def evolve(self, fitness_fn, num_generations=100):
        for gen in range(num_generations):
            # 적합도 평가
            fitness = []
            for seeds in self.population:
                policy = self.decode(seeds)
                fitness.append(fitness_fn(policy))

            # 변이: 시드 하나 추가
            sorted_idx = np.argsort(fitness)[::-1]
            elites = [self.population[i] for i in sorted_idx[:10]]

            new_pop = [list(e) for e in elites]
            while len(new_pop) < self.pop_size:
                parent = elites[np.random.randint(len(elites))]
                child = list(parent) + [np.random.randint(0, 2**31)]
                new_pop.append(child)

            self.population = new_pop

            if gen % 10 == 0:
                print(f"세대 {gen}: 최고 = {max(fitness):.1f}")

신규성 탐색 (Novelty Search)

적합도 대신 행동의 신규성을 기준으로 선택하는 방법이다. 새로운 행동 패턴을 보이는 개체가 선호되어, 다양한 전략을 탐색한다:

class NoveltySearch:
    """신규성 탐색: 행동 다양성으로 개체 선택"""

    def __init__(self, archive_size=1000, k_nearest=15):
        self.archive = []
        self.archive_size = archive_size
        self.k = k_nearest

    def behavior_characterization(self, trajectory):
        """에이전트의 행동을 특성 벡터로 요약"""
        # 예: 최종 위치, 방문한 상태의 통계 등
        if len(trajectory) == 0:
            return np.zeros(4)

        states = np.array(trajectory)
        return np.array([
            states[:, 0].mean(),  # 평균 x 위치
            states[:, 1].mean(),  # 평균 y 위치
            states[:, 0].std(),   # x 분산
            states[:, 1].std(),   # y 분산
        ])

    def novelty_score(self, behavior):
        """k-최근접 이웃과의 평균 거리 = 신규성 점수"""
        if len(self.archive) < self.k:
            return float('inf')

        distances = [np.linalg.norm(behavior - b)
                     for b in self.archive]
        distances.sort()
        return np.mean(distances[:self.k])

    def add_to_archive(self, behavior):
        self.archive.append(behavior)
        if len(self.archive) > self.archive_size:
            # 랜덤하게 하나 제거
            idx = np.random.randint(len(self.archive))
            self.archive.pop(idx)

ES vs GA 비교

항목	진화 전략 (ES)	유전 알고리즘 (GA)
업데이트 방식	그래디언트 추정 (연속적)	선택 + 교차 + 변이 (이산적)
수학적 기반	정규 분포 기반 최적화	생물학적 진화 모방
인구 다양성	단일 분포에서 샘플링	여러 개체가 독립적으로 존재
병렬화	매우 쉬움	쉬움
대규모 파라미터	비교적 잘 확장	교차 연산이 비효율적

핵심 요약

Black-Box 최적화는 그래디언트 없이 보상 신호만으로 정책을 최적화한다
ES는 무작위 변이의 보상으로 그래디언트를 추정하며, 대규모 병렬화에 유리하다
GA는 선택, 교차, 변이를 통한 인구 기반 최적화를 수행한다
Deep GA와 신규성 탐색으로 GA의 한계를 극복할 수 있다
그래디언트 기반 방법보다 샘플 효율은 낮지만, 특수한 환경에서 유용하다

다음 글에서는 환경 모델을 학습하여 활용하는 모델 기반 강화학습을 알아보겠다.