[深層強化学習] 08. 強化学習で株式トレーディングを作る

トレーディングと強化学習

株式トレーディングは強化学習の自然な応用分野です。トレーダー（エージェント）が市場（環境）で売買/保有（行動）を決定し、利益（報酬）を得る構造は強化学習フレームワークと正確に一致します。

注意事項

この記事は教育目的の実習であり、実際の投資に直接適用することは推奨しません。実際の金融市場はここで扱うものよりはるかに複雑です。

トレーディング基礎概念

基本用語

買い（Buy/Long）: 株式を購入して価格上昇に賭ける
売り（Sell）: 保有株式を売却してポジションを清算（本記事では空売り（Short）は扱いません）
ポジション（Position）: 現在保有している株式の状態
収益率（Return）: 投資額に対する利益の比率
手数料（Commission）: 取引時に発生するコスト
スリッページ（Slippage）: 注文価格と実際の約定価格の差

強化学習でトレーディングをモデリング

RL要素	トレーディング対応
状態	過去の価格データ、テクニカル指標、現在ポジション
行動	買い、売り、保有
報酬	実現利益、未実現損益変化量
エピソード	一定期間のトレーディングセッション

データ準備

価格データ生成

実習のために合成データを生成します。実際の適用時にはYahoo Financeなどからデータを取得できます。

import numpy as np
import pandas as pd

def generate_stock_data(n_days=1000, initial_price=100.0, volatility=0.02, seed=42):
    """Generate a synthetic daily OHLCV price table.

    Closing prices are built by compounding normally distributed daily
    returns (mean 0.0005, standard deviation ``volatility``), i.e. a
    discrete random walk in the spirit of geometric Brownian motion.

    Args:
        n_days: Number of rows (trading days) to generate.
        initial_price: Price level the compounding starts from.
        volatility: Standard deviation of the daily returns.
        seed: Value passed to ``np.random.seed`` (reseeds NumPy's global RNG).

    Returns:
        A ``pandas.DataFrame`` with the columns ``close``, ``open``,
        ``high``, ``low`` and ``volume`` (in that order).
    """
    np.random.seed(seed)

    # The draws below happen in a fixed order so that a given seed always
    # reproduces the same table: returns, open noise, high wick, low wick, volume.
    simple_returns = np.random.normal(0.0005, volatility, n_days)
    close = initial_price * np.cumprod(1 + simple_returns)

    open_ = close * (1 + np.random.normal(0, 0.005, n_days))
    upper_wick = np.abs(np.random.normal(0, 0.01, n_days))
    lower_wick = np.abs(np.random.normal(0, 0.01, n_days))
    volume = np.random.randint(100000, 1000000, n_days).astype(float)

    return pd.DataFrame({
        'close': close,
        'open': open_,
        # High/low extend beyond the open-close body by a non-negative wick.
        'high': np.maximum(open_, close) * (1 + upper_wick),
        'low': np.minimum(open_, close) * (1 - lower_wick),
        'volume': volume,
    })

# Build a 2000-day synthetic series, then print its row count and first rows.
stock_data = generate_stock_data(n_days=2000)
print(f"데이터 크기: {len(stock_data)}")
print(stock_data.head())

テクニカル指標計算

def add_technical_indicators(df):
    """Append technical-indicator columns to ``df`` and drop warm-up rows.

    The frame is modified in place: indicator columns are added, every row
    containing a NaN is removed, and the index is reset to 0..n-1.

    Added columns, in order: ``sma_10``, ``sma_30``, ``rsi``, ``bb_upper``,
    ``bb_lower``, ``bb_position``, ``macd``, ``macd_signal``, ``returns``,
    ``returns_5d``.

    Args:
        df: DataFrame with at least a ``close`` column.

    Returns:
        The same ``df`` object, after modification.
    """
    close = df['close']

    # Simple moving averages over 10 and 30 rows.
    for span in (10, 30):
        df[f'sma_{span}'] = close.rolling(window=span).mean()

    # RSI from 14-row simple averages of gains and losses.
    change = close.diff()
    avg_gain = change.where(change > 0, 0).rolling(window=14).mean()
    avg_loss = (-change.where(change < 0, 0)).rolling(window=14).mean()
    relative_strength = avg_gain / (avg_loss + 1e-10)  # epsilon avoids 0-division
    df['rsi'] = 100 - (100 / (1 + relative_strength))

    # Bollinger bands: 20-row mean +/- 2 standard deviations.
    band_window = close.rolling(window=20)
    band_mid = band_window.mean()
    band_sd = band_window.std()
    df['bb_upper'] = band_mid + 2 * band_sd
    df['bb_lower'] = band_mid - 2 * band_sd
    band_width = df['bb_upper'] - df['bb_lower'] + 1e-10
    df['bb_position'] = (close - df['bb_lower']) / band_width

    # MACD line (EMA12 - EMA26) and its 9-span EMA signal line.
    fast_ema = close.ewm(span=12).mean()
    slow_ema = close.ewm(span=26).mean()
    df['macd'] = fast_ema - slow_ema
    df['macd_signal'] = df['macd'].ewm(span=9).mean()

    # 1-row and 5-row percentage returns.
    df['returns'] = close.pct_change()
    df['returns_5d'] = close.pct_change(5)

    # Rolling windows leave NaNs at the start; remove those rows in place.
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

# Add the indicator columns, then print the remaining row count and the
# resulting column list.
stock_data = add_technical_indicators(stock_data)
print(f"지표 추가 후 데이터 크기: {len(stock_data)}")
print(f"특성 목록: {list(stock_data.columns)}")

トレーディング環境設計

Gymnasiumインターフェースに従うカスタムトレーディング環境を実装します。

import gymnasium as gym
from gymnasium import spaces

class StockTradingEnv(gym.Env):
    """Long-only, single-asset trading environment with a Gymnasium interface.

    Actions (``Discrete(3)``):
        * ``1`` -- buy with up to 95% of the cash balance (only when flat)
        * ``2`` -- sell the entire position (only when long)
        * any other value (i.e. ``0``) -- do nothing

    Observation: a flat ``float32`` vector holding the last ``window_size``
    rows of the feature columns (each column min-max scaled within the
    window) followed by three position features: a long/flat flag, the
    unrealized PnL relative to the initial balance, and the share of the
    portfolio currently held in stock.

    Reward: relative change of total portfolio value over the step.
    """
    metadata = {"render_modes": ["human"]}

    def __init__(self, df, window_size=30, commission=0.001, initial_balance=100000):
        """Store the configuration and declare action/observation spaces.

        Args:
            df: DataFrame containing every column in ``feature_columns``.
            window_size: Number of past rows included in each observation.
            commission: Proportional fee applied to both buys and sells.
            initial_balance: Cash available at the start of every episode.

        Raises:
            ValueError: If ``window_size`` is not positive, if ``df`` lacks a
                required feature column, or if ``df`` is too short to allow
                at least one step after the initial window.
        """
        super().__init__()
        self.feature_columns = [
            'close', 'volume', 'sma_10', 'sma_30', 'rsi',
            'bb_position', 'macd', 'macd_signal', 'returns', 'returns_5d'
        ]

        # Fail early with a clear message instead of a KeyError/IndexError
        # deep inside reset()/step().
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        missing = [c for c in self.feature_columns if c not in df.columns]
        if missing:
            raise ValueError(f"df is missing required feature columns: {missing}")
        if len(df) < window_size + 2:
            raise ValueError(
                f"df needs at least window_size + 2 = {window_size + 2} rows, "
                f"got {len(df)}"
            )

        self.df = df
        self.window_size = window_size
        self.commission = commission
        self.initial_balance = initial_balance
        self.n_features = len(self.feature_columns)
        self.action_space = spaces.Discrete(3)
        # Flattened window plus the three position features.
        obs_shape = self.window_size * self.n_features + 3
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_shape,), dtype=np.float32)

    def _get_observation(self):
        """Build the observation vector for the current step."""
        start = self.current_step - self.window_size
        end = self.current_step
        # Work on an explicit float64 array: the array returned by `.values`
        # can be read-only (pandas Copy-on-Write) and an integer-typed window
        # would truncate the scaled values when written back in place.
        window = self.df[self.feature_columns].iloc[start:end].to_numpy(dtype=np.float64)

        # Per-column min-max scaling; constant (or NaN) columns become 0.
        col_min = window.min(axis=0)
        col_range = window.max(axis=0) - col_min
        scaled = np.zeros_like(window)
        np.divide(window - col_min, col_range, out=scaled, where=col_range > 0)

        position_info = np.array([
            1.0 if self.position > 0 else 0.0,
            self.unrealized_pnl / self.initial_balance,
            self.shares * self.current_price / self.total_value,
        ], dtype=np.float32)
        return np.concatenate([scaled.flatten(), position_info]).astype(np.float32)

    @property
    def current_price(self):
        """Closing price at ``current_step``."""
        return self.df['close'].iloc[self.current_step]

    @property
    def unrealized_pnl(self):
        """Mark-to-market profit of the open position (0.0 when flat)."""
        if self.position > 0:
            return self.shares * (self.current_price - self.entry_price)
        return 0.0

    @property
    def total_value(self):
        """Cash balance plus the market value of held shares."""
        return self.balance + self.shares * self.current_price

    def reset(self, seed=None, options=None):
        """Start a new episode at the first step with a full window.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares = 0
        self.position = 0
        self.entry_price = 0.0
        self.total_trades = 0
        self.winning_trades = 0
        self.trade_history = []
        return self._get_observation(), {}

    def step(self, action):
        """Apply ``action`` at the current price and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        prev_total = self.total_value
        trade_info = ""
        current_price = self.current_price

        if action == 1 and self.position == 0:
            # Spend at most 95% of cash, commission included in the unit cost.
            max_shares = int(self.balance * 0.95 / (current_price * (1 + self.commission)))
            if max_shares > 0:
                cost = max_shares * current_price * (1 + self.commission)
                self.balance -= cost
                self.shares = max_shares
                self.position = 1
                self.entry_price = current_price
                trade_info = f"buy {max_shares} @ {current_price:.2f}"
        elif action == 2 and self.position == 1:
            sold_shares = self.shares
            proceeds = sold_shares * current_price * (1 - self.commission)
            self.balance += proceeds
            # Per-trade return, measured on price only (commission excluded).
            pnl = (current_price - self.entry_price) / self.entry_price
            self.total_trades += 1
            if pnl > 0:
                self.winning_trades += 1
            self.trade_history.append(pnl)
            self.shares = 0
            self.position = 0
            self.entry_price = 0.0
            trade_info = f"sell {sold_shares} @ {current_price:.2f} (pnl {pnl:+.2%})"

        self.current_step += 1
        current_total = self.total_value
        # Plain Python scalars keep the return values API-conformant.
        reward = float((current_total - prev_total) / prev_total)
        # NOTE(review): Gymnasium usually reports "ran out of data" as
        # truncation and a stop-out as termination; the assignment below is
        # kept as-is so existing training loops behave the same -- confirm
        # which is intended.
        terminated = self.current_step >= len(self.df) - 1
        truncated = bool(self.total_value < self.initial_balance * 0.5)
        info = {"total_value": self.total_value, "balance": self.balance, "position": self.position, "total_trades": self.total_trades, "trade_info": trade_info}
        return self._get_observation(), reward, terminated, truncated, info

# Smoke test: build the environment, reset it, and print the observation
# shape and the starting portfolio value.
env = StockTradingEnv(stock_data, window_size=30)
obs, info = env.reset()
print(f"관찰 차원: {obs.shape}")
print(f"초기 포트폴리오: {env.total_value:,.0f}원")

フィードフォワードDQNモデル

import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

class TradingDQN(nn.Module):
    """Feed-forward Q-network for trading.

    Three hidden layers (256, 128, 64 units, ReLU) with dropout 0.2 after
    the first two, followed by a linear head with one output per action.
    """

    def __init__(self, obs_size, n_actions):
        super().__init__()
        # (hidden width, dropout probability or None) for each hidden layer.
        hidden_spec = ((256, 0.2), (128, 0.2), (64, None))

        layers = []
        in_dim = obs_size
        for width, drop_p in hidden_spec:
            layers.append(nn.Linear(in_dim, width))
            layers.append(nn.ReLU())
            if drop_p is not None:
                layers.append(nn.Dropout(drop_p))
            in_dim = width
        layers.append(nn.Linear(in_dim, n_actions))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input ``x``."""
        q_values = self.net(x)
        return q_values

CNNモデル：価格チャートを画像のように処理

価格データの時間的パターンを1D畳み込みで捉えます。

class TradingCNN(nn.Module):
    """1D-CNN trading model.

    The flat observation is split into the price/indicator window, which is
    run through two ``Conv1d`` layers and average-pooled over time, and the
    trailing three position features, which are concatenated to the pooled
    features before the fully connected head.
    """

    def __init__(self, window_size, n_features, n_actions):
        super().__init__()
        self.window_size = window_size
        self.n_features = n_features

        self.conv = nn.Sequential(
            nn.Conv1d(n_features, 32, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        # 64 pooled channels + 3 position features.
        self.fc = nn.Sequential(
            nn.Linear(64 + 3, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions),
        )

    def forward(self, x):
        """Map a ``(batch, window_size * n_features + 3)`` input to action values."""
        market_flat, position_info = x[:, :-3], x[:, -3:]

        # (batch, window, features) -> (batch, features, window) for Conv1d.
        series = market_flat.view(x.shape[0], self.window_size, self.n_features)
        series = series.permute(0, 2, 1)

        pooled = self.conv(series).squeeze(-1)
        return self.fc(torch.cat([pooled, position_info], dim=1))

まとめ

問題定義: 株式トレーディングをMDPとしてモデル化（状態=市場データ、行動=売買/保有、報酬=利益）
データ準備: 価格データにテクニカル指標を追加して観測空間を構成
環境設計: Gymnasiumインターフェースのカスタムトレーディング環境を実装
モデル: フィードフォワードDQNと1D CNNモデルの両方が活用可能
報酬設計: 単純な収益率以外にリスク調整、取引ペナルティなど多様な要素を考慮
評価: 収益率、シャープレシオ、最大ドローダウンなど多様な成果指標で評価

次の記事では、価値ベースの手法から離れ、方策を直接最適化するPolicy Gradient法を見ていきます。