- Authors

- Name
- Youngju Kim
- @fjvbn20031
Trading and Reinforcement Learning
Stock trading is a natural application of reinforcement learning. The structure where a trader (agent) decides to buy/sell/hold (actions) in the market (environment) and earns profit (reward) matches the RL framework exactly.
Disclaimer
This article is for educational purposes only, and applying it directly to real investments is not recommended. Real financial markets are far more complex than what is covered here.
Trading Basics
Basic Terminology
- Buy/Long: Purchase stocks betting on price increase
- Sell: Close a long position by selling held stocks
- Short: Sell borrowed stocks betting on a price decline (not used in this article)
- Position: Current state of held stocks
- Return: Ratio of profit to investment
- Commission: Cost incurred during transactions
- Slippage: Difference between order price and actual execution price
Modeling Trading as Reinforcement Learning
| RL Element | Trading Counterpart |
|---|---|
| State | Historical price data, technical indicators, current position |
| Action | Buy, Sell, Hold |
| Reward | Realized profit, unrealized P&L change |
| Episode | A trading session over a given period |
Data Preparation
Price Data Generation
We generate synthetic data for practice. For real applications, data can be fetched from Yahoo Finance, etc.
import numpy as np
import pandas as pd
def generate_stock_data(n_days=1000, initial_price=100.0, volatility=0.02, seed=42):
    """Generate synthetic OHLCV stock data (geometric-Brownian-motion-style model).

    Args:
        n_days: number of trading days to simulate.
        initial_price: price level the series starts from.
        volatility: standard deviation of daily returns.
        seed: seed for the local random generator (reproducible output).

    Returns:
        pd.DataFrame with columns close, open, high, low, volume.
    """
    # Use a local Generator instead of np.random.seed so we don't clobber
    # the caller's global NumPy RNG state as a side effect.
    rng = np.random.default_rng(seed)
    # Daily returns with a small positive drift (0.05%/day).
    daily_returns = rng.normal(0.0005, volatility, n_days)
    prices = initial_price * np.cumprod(1 + daily_returns)
    data = pd.DataFrame()
    data['close'] = prices
    # Open is close plus small noise; high/low bracket open and close so the
    # OHLC invariants low <= {open, close} <= high always hold.
    data['open'] = prices * (1 + rng.normal(0, 0.005, n_days))
    data['high'] = np.maximum(data['open'], data['close']) * (1 + np.abs(rng.normal(0, 0.01, n_days)))
    data['low'] = np.minimum(data['open'], data['close']) * (1 - np.abs(rng.normal(0, 0.01, n_days)))
    data['volume'] = rng.integers(100000, 1000000, n_days).astype(float)
    return data
# Generate a 2,000-day synthetic price series used throughout the article.
stock_data = generate_stock_data(n_days=2000)
print(f"데이터 크기: {len(stock_data)}")
print(stock_data.head())
Technical Indicators
def add_technical_indicators(df):
    """Return a copy of *df* augmented with technical-indicator columns.

    Adds: sma_10/sma_30 (moving averages), rsi (14-day), Bollinger bands
    (bb_upper/bb_lower/bb_position, 20-day), MACD (macd/macd_signal), and
    1-day/5-day returns. Warm-up rows containing NaNs are dropped and the
    index is reset.

    Args:
        df: DataFrame with at least a 'close' column.

    Returns:
        New DataFrame (the input is left untouched).
    """
    # Work on a copy so the caller's DataFrame is not mutated in place.
    df = df.copy()
    # Simple moving averages.
    df['sma_10'] = df['close'].rolling(window=10).mean()
    df['sma_30'] = df['close'].rolling(window=30).mean()
    # RSI: ratio of average gains to average losses over 14 days.
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / (loss + 1e-10)  # epsilon avoids division by zero
    df['rsi'] = 100 - (100 / (1 + rs))
    # Bollinger bands: mean +/- 2 std over 20 days; bb_position is where the
    # close sits within the band, scaled to roughly [0, 1].
    bb_mean = df['close'].rolling(window=20).mean()
    bb_std = df['close'].rolling(window=20).std()
    df['bb_upper'] = bb_mean + 2 * bb_std
    df['bb_lower'] = bb_mean - 2 * bb_std
    df['bb_position'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'] + 1e-10)
    # MACD: difference of 12/26-day EMAs, plus a 9-day signal line.
    ema12 = df['close'].ewm(span=12).mean()
    ema26 = df['close'].ewm(span=26).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    # Raw momentum features.
    df['returns'] = df['close'].pct_change()
    df['returns_5d'] = df['close'].pct_change(5)
    # Drop the indicator warm-up rows (the 30-day SMA dominates: 29 rows).
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
# Enrich the raw price series with indicator features; NaN warm-up rows
# (from the rolling windows) are dropped inside the helper.
stock_data = add_technical_indicators(stock_data)
print(f"지표 추가 후 데이터 크기: {len(stock_data)}")
print(f"특성 목록: {list(stock_data.columns)}")
Trading Environment Design
We implement a custom trading environment following the Gymnasium interface.
import gymnasium as gym
from gymnasium import spaces
class StockTradingEnv(gym.Env):
    """Single-asset stock trading environment with the Gymnasium interface.

    Actions: 0 = hold, 1 = buy (invests ~95% of cash), 2 = sell (close the
    whole position). Observation: the trailing ``window_size`` rows of the
    feature columns, min-max normalized per window and flattened, plus three
    position statistics. Reward: fractional change in total portfolio value
    over one step.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self, df, window_size=30, commission=0.001,
                 initial_balance=100000):
        """
        Args:
            df: DataFrame containing all columns in ``feature_columns``
                (e.g. the output of ``add_technical_indicators``).
            window_size: number of past rows in each observation.
            commission: proportional transaction cost applied on both buys
                and sells.
            initial_balance: starting cash.
        """
        super().__init__()
        self.df = df
        self.window_size = window_size
        self.commission = commission
        self.initial_balance = initial_balance
        # Columns read from df to build the observation window.
        self.feature_columns = [
            'close', 'volume', 'sma_10', 'sma_30', 'rsi',
            'bb_position', 'macd', 'macd_signal', 'returns', 'returns_5d'
        ]
        self.n_features = len(self.feature_columns)
        # 0 = hold, 1 = buy, 2 = sell.
        self.action_space = spaces.Discrete(3)
        # Flattened window plus 3 position features (see _get_observation).
        obs_shape = self.window_size * self.n_features + 3
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_shape,), dtype=np.float32
        )

    def _get_observation(self):
        """Build the current observation vector.

        Each feature column is min-max scaled within the trailing window
        (constant columns become all zeros), then flattened and concatenated
        with the position statistics.
        """
        start = self.current_step - self.window_size
        end = self.current_step
        window_data = self.df[self.feature_columns].iloc[start:end].values
        # Per-window, per-column min-max normalization.
        for i in range(self.n_features):
            col = window_data[:, i]
            min_val = col.min()
            max_val = col.max()
            if max_val - min_val > 0:
                window_data[:, i] = (col - min_val) / (max_val - min_val)
            else:
                window_data[:, i] = 0.0
        flat_window = window_data.flatten()
        position_info = np.array([
            1.0 if self.position > 0 else 0.0,            # currently holding?
            self.unrealized_pnl / self.initial_balance,    # open P&L, scaled
            self.shares * self.current_price / self.total_value,  # stock share of portfolio
        ], dtype=np.float32)
        return np.concatenate([flat_window, position_info]).astype(np.float32)

    @property
    def current_price(self):
        """Closing price at the current step."""
        return self.df['close'].iloc[self.current_step]

    @property
    def unrealized_pnl(self):
        """Open profit/loss of the held position (0 when flat)."""
        if self.position > 0:
            return self.shares * (self.current_price - self.entry_price)
        return 0.0

    @property
    def total_value(self):
        """Cash plus mark-to-market value of held shares."""
        return self.balance + self.shares * self.current_price

    def reset(self, seed=None, options=None):
        """Reset portfolio state and start at the first full window."""
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares = 0
        self.position = 0          # 0 = flat, 1 = long
        self.entry_price = 0.0
        self.total_trades = 0      # number of completed (closed) trades
        self.winning_trades = 0
        self.trade_history = []    # per-trade returns of closed trades
        return self._get_observation(), {}

    def step(self, action):
        """Apply one action and advance one day.

        Returns the Gymnasium 5-tuple (obs, reward, terminated, truncated,
        info). Trades execute at the current close before time advances.
        """
        prev_total = self.total_value
        reward = 0.0
        trade_info = ""
        current_price = self.current_price
        if action == 1 and self.position == 0:
            # Buy: invest up to 95% of cash, commission included in the cost.
            max_shares = int(self.balance * 0.95 / (current_price * (1 + self.commission)))
            if max_shares > 0:
                cost = max_shares * current_price * (1 + self.commission)
                self.balance -= cost
                self.shares = max_shares
                self.position = 1
                self.entry_price = current_price
                trade_info = f"매수 {max_shares}주 @ {current_price:.2f}"
        elif action == 2 and self.position == 1:
            # Sell: liquidate the entire position, commission deducted.
            proceeds = self.shares * current_price * (1 - self.commission)
            self.balance += proceeds
            pnl = (current_price - self.entry_price) / self.entry_price
            self.total_trades += 1
            if pnl > 0:
                self.winning_trades += 1
            self.trade_history.append(pnl)
            trade_info = f"매도 {self.shares}주 @ {current_price:.2f}, 수익률: {pnl:.2%}"
            self.shares = 0
            self.position = 0
            self.entry_price = 0.0
        self.current_step += 1
        current_total = self.total_value
        # Reward: fractional change in portfolio value over this step.
        reward = (current_total - prev_total) / prev_total
        terminated = self.current_step >= len(self.df) - 1      # end of data
        truncated = self.total_value < self.initial_balance * 0.5  # 50% loss stop
        info = {
            "total_value": self.total_value,
            "balance": self.balance,
            "position": self.position,
            "total_trades": self.total_trades,
            "trade_info": trade_info,
        }
        return self._get_observation(), reward, terminated, truncated, info
# Smoke-test: instantiate the environment and inspect the initial state.
env = StockTradingEnv(stock_data, window_size=30)
obs, info = env.reset()
print(f"관찰 차원: {obs.shape}")
print(f"초기 포트폴리오: {env.total_value:,.0f}원")
Random Agent Baseline
def evaluate_random_agent(env, n_episodes=10):
    """Run a uniformly random policy for several episodes as a baseline.

    Args:
        env: trading environment with the Gymnasium step/reset API.
        n_episodes: how many full episodes to run.

    Returns:
        List of per-episode dicts with final_value, return, and trades.
    """
    results = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        done = False
        # Play the episode to the end with random actions.
        while not done:
            obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
            done = terminated or truncated
        final_value = info['total_value']
        total_return = (final_value - env.initial_balance) / env.initial_balance
        results.append({
            'final_value': final_value,
            'return': total_return,
            'trades': info['total_trades'],
        })
    returns = [r['return'] for r in results]
    print(f"=== 무작위 에이전트 ({n_episodes}회) ===")
    print(f"평균 수익률: {np.mean(returns):.2%}")
    print(f"최대 수익률: {np.max(returns):.2%}")
    print(f"최소 수익률: {np.min(returns):.2%}")
    return results
# random_results = evaluate_random_agent(env)
Feedforward DQN Model
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
class TradingDQN(nn.Module):
    """Feedforward DQN: maps a flat observation vector to per-action Q-values.

    Architecture: 256 -> 128 (each with ReLU + 20% dropout), then 64 -> n_actions.
    """

    def __init__(self, obs_size, n_actions):
        super().__init__()
        # Build the two dropout-regularized hidden stages in a loop,
        # then append the dropout-free head.
        stages = []
        for fan_in, fan_out in ((obs_size, 256), (256, 128)):
            stages.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)])
        stages.extend([nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, n_actions)])
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions)."""
        return self.net(x)
CNN Model: Processing Price Charts Like Images
Temporal patterns in price data are captured using 1D convolutions.
class TradingCNN(nn.Module):
    """1D-CNN trading model.

    Convolves the (features x time) window, global-average-pools it, then
    fuses the pooled embedding with the 3 trailing position features in a
    small fully-connected head.
    """

    def __init__(self, window_size, n_features, n_actions):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(n_features, 32, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),  # collapse the time axis to length 1
        )
        self.fc = nn.Sequential(
            nn.Linear(64 + 3, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions),
        )
        self.window_size = window_size
        self.n_features = n_features

    def forward(self, x):
        """x: (batch, window_size * n_features + 3) flat observation."""
        n = x.size(0)
        # Split off the window, reshape to (batch, features, time) for Conv1d.
        window = x[:, :-3].reshape(n, self.window_size, self.n_features).permute(0, 2, 1)
        pooled = self.conv(window).squeeze(-1)
        # Concatenate the 3 position features and score the actions.
        return self.fc(torch.cat((pooled, x[:, -3:]), dim=1))
Training the Trading Agent
class ReplayBuffer:
    """Fixed-capacity FIFO store of (s, a, r, s', done) transitions."""

    def __init__(self, capacity):
        # deque(maxlen=...) evicts the oldest transition automatically.
        self._storage = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition (oldest is dropped when at capacity)."""
        self._storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw a uniform random mini-batch as column-stacked arrays.

        Returns (states, actions, rewards(float32), next_states, dones(bool)).
        """
        transitions = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(next_states),
            np.array(dones, dtype=np.bool_),
        )

    def __len__(self):
        return len(self._storage)
def train_trading_agent(env, model_type="ff", n_episodes=500):
    """Train a Double-DQN trading agent on *env*.

    Args:
        env: StockTradingEnv instance.
        model_type: "cnn" selects TradingCNN; anything else uses TradingDQN.
        n_episodes: number of training episodes.

    Returns:
        (online_net, returns_history): the trained network and the list of
        per-episode portfolio returns. The best-so-far model (by episode
        return) is also checkpointed to "best_trading_model.pth".
    """
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n
    device = torch.device("cpu")
    # Online network is trained; the target network provides stable TD targets.
    if model_type == "cnn":
        online_net = TradingCNN(env.window_size, env.n_features, n_actions).to(device)
        target_net = TradingCNN(env.window_size, env.n_features, n_actions).to(device)
    else:
        online_net = TradingDQN(obs_size, n_actions).to(device)
        target_net = TradingDQN(obs_size, n_actions).to(device)
    target_net.load_state_dict(online_net.state_dict())
    optimizer = optim.Adam(online_net.parameters(), lr=1e-4)
    buffer = ReplayBuffer(50000)
    # Epsilon-greedy exploration schedule (decayed once per episode).
    epsilon = 1.0
    epsilon_min = 0.05
    epsilon_decay = 0.995
    gamma = 0.99
    batch_size = 64
    target_update = 50  # episodes between target-network syncs
    best_return = -float('inf')
    returns_history = []
    for episode in range(n_episodes):
        obs, _ = env.reset()
        total_reward = 0
        while True:
            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q = online_net(torch.tensor([obs], dtype=torch.float32).to(device))
                action = q.argmax(dim=1).item()
            next_obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            buffer.push(obs, action, reward, next_obs, done)
            total_reward += reward
            obs = next_obs
            if len(buffer) >= batch_size:
                # One gradient step on a sampled mini-batch.
                s, a, r, ns, d = buffer.sample(batch_size)
                s_t = torch.tensor(s, dtype=torch.float32).to(device)
                a_t = torch.tensor(a, dtype=torch.long).to(device)
                r_t = torch.tensor(r, dtype=torch.float32).to(device)
                ns_t = torch.tensor(ns, dtype=torch.float32).to(device)
                d_t = torch.tensor(d, dtype=torch.bool).to(device)
                current_q = online_net(s_t).gather(1, a_t.unsqueeze(1)).squeeze(1)
                with torch.no_grad():
                    # Double DQN: online net picks the action, target net scores it.
                    best_a = online_net(ns_t).argmax(dim=1)
                    next_q = target_net(ns_t).gather(1, best_a.unsqueeze(1)).squeeze(1)
                    next_q[d_t] = 0.0  # no bootstrapping past episode end
                    target_q = r_t + gamma * next_q
                loss = nn.SmoothL1Loss()(current_q, target_q)
                optimizer.zero_grad()
                loss.backward()
                # Clip gradients to stabilize training.
                torch.nn.utils.clip_grad_norm_(online_net.parameters(), 1.0)
                optimizer.step()
            if done:
                break
        if episode % target_update == 0:
            target_net.load_state_dict(online_net.state_dict())
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        episode_return = (env.total_value - env.initial_balance) / env.initial_balance
        returns_history.append(episode_return)
        # Checkpoint whenever a new best episode return is reached.
        if episode_return > best_return:
            best_return = episode_return
            torch.save(online_net.state_dict(), "best_trading_model.pth")
        if episode % 50 == 0:
            mean_return = np.mean(returns_history[-50:])
            print(f"에피소드 {episode}: 수익률={episode_return:.2%}, 평균 수익률={mean_return:.2%}, 거래={info['total_trades']}, 엡실론={epsilon:.3f}")
    return online_net, returns_history
# trained_model, history = train_trading_agent(env, model_type="ff", n_episodes=500)
Agent Evaluation and Analysis
def backtest_agent(env, net, n_episodes=5):
    """Backtest a trained agent with a greedy (argmax-Q) policy.

    Args:
        env: trading environment with the Gymnasium step/reset API.
        net: trained Q-network; called as net(obs_batch) -> Q-values.
        n_episodes: number of backtest episodes.

    Returns:
        List of per-episode dicts with total_return, sharpe_ratio,
        max_drawdown, and total_trades.
    """
    # Fix: switch the network to eval mode so Dropout layers are disabled;
    # otherwise the "greedy" evaluation is stochastic and non-reproducible.
    net.eval()
    all_results = []
    for episode in range(n_episodes):
        obs, _ = env.reset()
        portfolio_values = [env.total_value]
        while True:
            # Greedy action from the Q-network (no exploration, no gradients).
            with torch.no_grad():
                q = net(torch.tensor([obs], dtype=torch.float32))
            action = q.argmax(dim=1).item()
            obs, reward, terminated, truncated, info = env.step(action)
            portfolio_values.append(env.total_value)
            if terminated or truncated:
                break
        total_return = (portfolio_values[-1] - portfolio_values[0]) / portfolio_values[0]
        # Annualized Sharpe ratio from step-wise returns (252 trading days/year).
        daily_returns = np.diff(portfolio_values) / portfolio_values[:-1]
        sharpe_ratio = np.mean(daily_returns) / (np.std(daily_returns) + 1e-10) * np.sqrt(252)
        # Maximum drawdown: worst fall from a running peak.
        peak = np.maximum.accumulate(portfolio_values)
        drawdown = (np.array(portfolio_values) - peak) / peak
        max_drawdown = drawdown.min()
        result = {'total_return': total_return, 'sharpe_ratio': sharpe_ratio, 'max_drawdown': max_drawdown, 'total_trades': info['total_trades']}
        all_results.append(result)
        print(f"\n에피소드 {episode + 1}: 총 수익률: {total_return:.2%}, 샤프 비율: {sharpe_ratio:.2f}, 최대 낙폭: {max_drawdown:.2%}")
    avg_return = np.mean([r['total_return'] for r in all_results])
    print(f"\n=== 전체 백테스트 결과 === 평균 수익률: {avg_return:.2%}")
    return all_results
# backtest_results = backtest_agent(env, trained_model)
Summary
- Problem definition: Model stock trading as an MDP (state=market data, action=buy/sell/hold, reward=profit)
- Data preparation: Add technical indicators to price data to construct observation space
- Environment design: Implement custom trading environment with Gymnasium interface
- Models: Both feedforward DQN and 1D CNN models can be used
- Reward design: Consider various factors beyond simple returns, such as risk adjustment and trading penalties
- Evaluation: Evaluate with diverse performance metrics including returns, Sharpe ratio, and maximum drawdown
In the next article, we will move beyond value-based methods to explore Policy Gradient methods that directly optimize the policy.