Split View: [심층 강화학습] 13. 웹 내비게이션과 강화학습
[심층 강화학습] 13. 웹 내비게이션과 강화학습
개요
웹 브라우저를 조작하여 정보를 찾고, 양식을 작성하고, 버튼을 클릭하는 작업은 인간에게는 자연스럽지만 기계에게는 매우 어렵다. 웹 페이지는 동적이고 다양한 레이아웃을 가지며, 같은 작업이라도 사이트마다 다른 방식으로 수행해야 한다.
웹 내비게이션에 강화학습을 적용하면, 에이전트가 웹 페이지의 시각적 정보나 DOM 구조를 관찰하여 적절한 행동(클릭, 타이핑 등)을 학습할 수 있다.
웹 내비게이션의 도전 과제
왜 어려운가
웹 환경은 전통적인 RL 환경과 크게 다르다:
- 거대한 상태 공간: 웹 페이지의 렌더링 결과(픽셀)는 수백만 차원이다
- 거대한 행동 공간: 마우스 위치(x, y) + 클릭/드래그 + 키보드 입력의 조합
- 지연 보상: 여러 단계의 클릭과 입력 후에야 작업 완료를 확인할 수 있다
- 부분 관측성: 스크롤해야 보이는 요소, 팝업, 동적 로딩 등
- 환경 비결정성: 같은 페이지도 로딩 시간에 따라 다른 상태를 보여준다
상태 표현 방식
| 방식 | 설명 | 장점 | 단점 |
|---|---|---|---|
| 픽셀 기반 | 스크린샷을 직접 입력으로 사용 | 범용적, 시각 정보 포함 | 고차원, 학습 느림 |
| DOM 기반 | HTML DOM 트리를 파싱하여 사용 | 구조 정보 풍부 | 사이트마다 구조 다름 |
| 혼합 방식 | 픽셀 + DOM 정보 결합 | 가장 풍부한 정보 | 복잡한 전처리 필요 |
브라우저 자동화와 강화학습
환경 인터페이스
웹 브라우저를 Gym 호환 환경으로 래핑한다:
import numpy as np
class WebEnvironment:
    """Gym-compatible web browser environment.

    Observations are preprocessed screenshots; the discrete action space is
    the union of grid-cell clicks (grid_size * grid_size actions) followed
    by single-character keyboard inputs (128 ASCII actions).
    """

    def __init__(self, task_config, screen_width=160, screen_height=210):
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.task = task_config
        # Action space: grid clicks + keyboard input.
        self.grid_size = 16  # 16x16 grid
        self.num_click_actions = self.grid_size * self.grid_size
        self.num_type_actions = 128  # ASCII characters
        self.total_actions = self.num_click_actions + self.num_type_actions

    def reset(self):
        """Start a new episode: load the task's page, return first observation."""
        self._load_page(self.task['url'])
        screenshot = self._get_screenshot()
        return self._preprocess(screenshot)

    def step(self, action):
        """Execute one action and return (obs, reward, done, info).

        Actions in [0, num_click_actions) are grid-cell clicks; the
        remaining indices are single ASCII characters to type.
        """
        if action < self.num_click_actions:
            # Click action: convert the flat index to a grid cell and click
            # the cell *center* (consistent with GridActionSpace), instead
            # of the top-left corner where the click can miss the element.
            row = action // self.grid_size
            col = action % self.grid_size
            x = int((col + 0.5) * (self.screen_width / self.grid_size))
            y = int((row + 0.5) * (self.screen_height / self.grid_size))
            self._click(x, y)
        else:
            # Typing action: remaining indices map directly to ASCII codes.
            char_idx = action - self.num_click_actions
            self._type_char(chr(char_idx))
        screenshot = self._get_screenshot()
        obs = self._preprocess(screenshot)
        reward = self._compute_reward()
        done = self._check_done()
        return obs, reward, done, {}

    def _preprocess(self, screenshot):
        """Convert a screenshot (PIL-style image) to a float32 array in [0, 1]."""
        # NOTE: resize takes (width, height); the resulting array is HxWxC.
        resized = np.array(screenshot.resize((self.screen_width,
                                              self.screen_height)))
        return resized.astype(np.float32) / 255.0
Mini World of Bits 벤치마크
OpenAI와 스탠퍼드 대학 연구진이 개발한 Mini World of Bits(MiniWoB)는 웹 내비게이션 RL의 표준 벤치마크이다. 간단한 HTML 위젯에서 특정 작업을 수행하는 것이 목표이다.
작업 예시
- click-button: 특정 텍스트가 적힌 버튼을 클릭
- click-checkboxes: 지정된 체크박스들을 선택
- enter-text: 텍스트 필드에 지정된 문자열 입력
- navigate-tree: 트리 구조 메뉴를 탐색하여 특정 항목 선택
- email-inbox: 이메일 목록에서 특정 조건의 이메일을 찾아 작업 수행
각 작업은 160x210 픽셀의 소형 웹 페이지로, 에이전트가 10초 이내에 완료해야 한다.
class MiniWoBTask:
    """Definition of a single MiniWoB task.

    Rewards lie in [-1, 1]: finishing the task earns a time-scaled bonus,
    exceeding the time limit is penalized, and anything else is neutral.
    """

    def __init__(self, task_name):
        self.task_name = task_name
        self.time_limit = 10.0  # seconds
        self.reward_range = (-1.0, 1.0)

    def get_reward(self, page_state, time_elapsed):
        """Return the reward for the current page state and elapsed time."""
        if self._is_task_complete(page_state):
            # Finishing faster yields a larger bonus, floored at 0.1.
            remaining_fraction = 1.0 - time_elapsed / self.time_limit
            return max(remaining_fraction, 0.1)
        if time_elapsed >= self.time_limit:
            # Ran out of time without completing the task.
            return -1.0
        # Episode still in progress.
        return 0.0
OpenAI Universe
OpenAI Universe는 다양한 환경(게임, 웹 브라우저 등)을 표준 인터페이스로 제공하는 플랫폼이었다. VNC(Virtual Network Computing)를 통해 브라우저 화면을 관찰하고, 마우스/키보드 이벤트를 전송한다.
VNC 기반 환경의 특징
- 실제 브라우저(Chrome, Firefox)를 Docker 컨테이너에서 실행
- 에이전트는 VNC 프로토콜로 화면 픽셀을 수신
- 행동은 마우스 좌표와 키보드 이벤트로 전송
- 네트워크 지연과 렌더링 지연이 존재
class VNCActionSpace:
    """Action space expressed as VNC input events."""

    def __init__(self, screen_width, screen_height):
        self.screen_width = screen_width
        self.screen_height = screen_height

    def click(self, x, y):
        """Build a left-button pointer event at pixel (x, y)."""
        event = {'type': 'pointer'}
        event['x'] = int(x)
        event['y'] = int(y)
        event['button'] = 1  # left button
        return event

    def type_text(self, text):
        """Build one key-press event per character of `text`."""
        return [
            {'type': 'key', 'key': char, 'action': 'press'}
            for char in text
        ]

    def scroll(self, x, y, direction='down'):
        """Build a scroll event; downward scrolls use a negative delta."""
        return {
            'type': 'scroll',
            'x': int(x),
            'y': int(y),
            'delta': -3 if direction == 'down' else 3,
        }
단순 클릭 접근법
가장 기본적인 웹 내비게이션 에이전트는 화면을 그리드로 분할하고, 어느 셀을 클릭할지 결정하는 분류 문제로 변환한다.
그리드 행동 공간
class GridActionSpace:
    """Map between flat click actions and pixel coordinates on an NxN grid."""

    def __init__(self, screen_w, screen_h, grid_n=16):
        self.screen_w = screen_w
        self.screen_h = screen_h
        self.grid_n = grid_n
        self.cell_w = screen_w / grid_n
        self.cell_h = screen_h / grid_n
        self.n_actions = grid_n * grid_n

    def action_to_coordinate(self, action_idx):
        """Return the (x, y) pixel at the center of the cell for `action_idx`."""
        row, col = divmod(action_idx, self.grid_n)
        # Offset by half a cell to land on the cell center.
        center_x = (col + 0.5) * self.cell_w
        center_y = (row + 0.5) * self.cell_h
        return int(center_x), int(center_y)

    def coordinate_to_action(self, x, y):
        """Return the flat action index of the cell containing pixel (x, y)."""
        # Clamp to the last cell so edge pixels stay in range.
        col = min(int(x / self.cell_w), self.grid_n - 1)
        row = min(int(y / self.cell_h), self.grid_n - 1)
        return row * self.grid_n + col
CNN 기반 모델
스크린샷을 입력으로 받아 그리드 셀의 클릭 확률을 출력하는 모델:
import torch
import torch.nn as nn
class WebNavigationModel(nn.Module):
    """Actor-critic CNN for grid-click web navigation.

    The convolutional trunk encodes a 3x210x160 screenshot; the policy head
    scores each of the grid_size * grid_size click cells and the value head
    estimates the scalar state value.
    """

    def __init__(self, grid_size=16):
        super().__init__()
        self.grid_size = grid_size
        num_cells = grid_size * grid_size
        # Shared convolutional trunk for screen feature extraction.
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.ReLU(),
        )
        # Probe the trunk once with a zero tensor to size the linear heads.
        self._feature_size = self._get_conv_output_size((3, 210, 160))
        # Actor head: logits over click cells.
        self.policy = nn.Sequential(
            nn.Linear(self._feature_size, 512),
            nn.ReLU(),
            nn.Linear(512, num_cells),
        )
        # Critic head: scalar state value.
        self.value = nn.Sequential(
            nn.Linear(self._feature_size, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def _get_conv_output_size(self, shape):
        """Run a dummy input of `shape` through the trunk to get the flat size."""
        with torch.no_grad():
            probe = torch.zeros(1, *shape)
            return self.conv(probe).view(1, -1).shape[1]

    def forward(self, screen):
        """Return (policy_logits, value) for a batch of screens (Bx3x210x160)."""
        flat = self.conv(screen).view(screen.size(0), -1)
        return self.policy(flat), self.value(flat)
학습 루프
def train_web_agent(model, env, num_episodes=10000, gamma=0.99):
    """Train a web navigation agent with A2C (advantage actor-critic).

    Args:
        model: actor-critic network returning (policy_logits, value) for a
            batch of screens (e.g. WebNavigationModel).
        env: Gym-style environment; reset() returns an HxWxC float screen,
            step(action) returns (next_state, reward, done, info).
        num_episodes: number of training episodes.
        gamma: discount factor used for the returns.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        log_probs = []
        values = []
        rewards = []
        while not done:
            # HxWxC float screen -> 1xCxHxW batch tensor for the CNN.
            state_t = torch.FloatTensor(state).permute(2, 0, 1).unsqueeze(0)
            logits, value = model(state_t)
            probs = torch.softmax(logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            next_state, reward, done, info = env.step(action.item())
            log_probs.append(dist.log_prob(action))
            values.append(value.squeeze())
            rewards.append(reward)
            state = next_state
            episode_reward += reward
        # A2C update once per finished episode.
        returns = compute_returns(rewards, gamma)
        returns_t = torch.FloatTensor(returns)
        values_t = torch.stack(values)
        log_probs_t = torch.stack(log_probs)
        # detach() keeps the advantage out of the critic's gradient path.
        advantages = returns_t - values_t.detach()
        policy_loss = -(log_probs_t * advantages).mean()
        value_loss = (returns_t - values_t).pow(2).mean()
        loss = policy_loss + 0.5 * value_loss
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping stabilizes training on noisy episode returns.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40.0)
        optimizer.step()
        if episode % 100 == 0:
            print(f"Episode {episode}: Reward={episode_reward:.2f}")
def compute_returns(rewards, gamma):
    """Compute discounted returns G_t = r_t + gamma * G_{t+1} for one episode.

    Args:
        rewards: per-step rewards in chronological order (may be empty).
        gamma: discount factor.

    Returns:
        A list of discounted returns aligned with `rewards`.
    """
    returns = []
    running = 0
    for reward in reversed(rewards):
        running = reward + gamma * running
        # append + final reverse is O(n); insert(0, ...) per step was O(n^2).
        returns.append(running)
    returns.reverse()
    return returns
인간 시연을 활용한 학습
순수 RL만으로는 웹 내비게이션 학습이 매우 느릴 수 있다. 인간 시연(human demonstrations)을 활용하면 학습을 크게 가속할 수 있다.
행동 복제 (Behavioral Cloning)
먼저 인간의 행동을 모방하는 지도 학습을 수행한 뒤, RL로 미세조정한다:
def pretrain_with_demonstrations(model, demos, num_epochs=50):
    """Behavioral cloning: supervised pretraining on human demonstrations.

    Each demo is a (screen, action) pair; the model's policy logits are
    trained per-sample with cross-entropy so the model reproduces the
    demonstrated action. Returns the model (trained in place).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    xent = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        loss_sum = 0
        num_correct = 0
        num_seen = 0
        for screen, demo_action in demos:
            # HxWxC screen -> 1xCxHxW batch; action -> one-element label.
            batch = torch.FloatTensor(screen).permute(2, 0, 1).unsqueeze(0)
            target = torch.tensor([demo_action], dtype=torch.long)
            logits, _ = model(batch)
            sample_loss = xent(logits, target)
            optimizer.zero_grad()
            sample_loss.backward()
            optimizer.step()
            loss_sum += sample_loss.item()
            num_correct += (logits.argmax(dim=-1) == target).sum().item()
            num_seen += 1
        accuracy = num_correct / num_seen
        print(f"Epoch {epoch}: Loss={loss_sum/len(demos):.4f}, "
              f"Accuracy={accuracy:.2%}")
    return model
시연 가중 학습
인간 시연의 경험을 replay buffer에 넣고 높은 우선순위를 부여하여, RL 학습 중에도 지속적으로 참조한다:
class DemonstrationReplayBuffer:
    """Replay buffer mixing human demonstrations with agent experience.

    Demonstrations are kept forever; agent experience is a bounded FIFO of
    at most `capacity` transitions. sample() draws a fixed fraction
    (`demo_ratio`) of each batch from the demonstration buffer.
    """

    def __init__(self, capacity, demo_ratio=0.25):
        self.capacity = capacity      # max agent transitions retained
        self.demo_ratio = demo_ratio  # fraction of each batch from demos
        self.demo_buffer = []
        self.agent_buffer = []

    def add_demonstration(self, transition):
        """Store a human-demonstration transition (never evicted)."""
        self.demo_buffer.append(transition)

    def add_agent_experience(self, transition):
        """Store an agent transition, evicting the oldest when full."""
        if len(self.agent_buffer) >= self.capacity:
            self.agent_buffer.pop(0)
        self.agent_buffer.append(transition)

    def sample(self, batch_size):
        """Sample a batch mixing demonstrations and agent experience.

        At most batch_size * demo_ratio transitions come from demos; if
        either buffer is short, the returned batch simply shrinks.
        """
        # Fix: `random` was used here without ever being imported, which
        # raised NameError at runtime. Function-scope import keeps the
        # snippet self-contained.
        import random
        n_demo = int(batch_size * self.demo_ratio)
        n_agent = batch_size - n_demo
        demo_samples = random.sample(
            self.demo_buffer,
            min(n_demo, len(self.demo_buffer))
        )
        agent_samples = random.sample(
            self.agent_buffer,
            min(n_agent, len(self.agent_buffer))
        )
        return demo_samples + agent_samples
실전 고려사항과 한계
현재 기술의 한계
- MiniWoB 벤치마크에서도 복잡한 작업(email-inbox, social-media 등)은 인간 수준에 미치지 못한다
- 실제 웹사이트의 다양성과 복잡성을 처리하기 어렵다
- DOM 구조의 동적 변화에 취약하다
최신 연구 방향
- 대규모 언어모델(LLM) 기반 웹 에이전트: HTML을 텍스트로 입력하여 행동을 생성
- 멀티모달 모델: 화면 이미지와 텍스트를 함께 이해하는 에이전트
- 계층적 RL: 고수준 계획(어떤 요소와 상호작용할지)과 저수준 실행(정확한 좌표 클릭)을 분리
핵심 요약
- 웹 내비게이션은 거대한 상태/행동 공간과 지연 보상이 특징인 어려운 RL 문제이다
- 그리드 기반 행동 공간으로 단순화하여 CNN + A2C로 기본 에이전트를 구현할 수 있다
- 인간 시연(행동 복제)으로 사전 학습하면 RL 학습이 크게 가속된다
- 최신 연구는 LLM과 멀티모달 모델을 활용한 범용 웹 에이전트로 향하고 있다
다음 글에서는 연속 행동 공간을 다루는 DDPG와 분포 정책 그래디언트를 알아보겠다.
[Deep RL] 13. Web Navigation and Reinforcement Learning
Overview
Navigating web browsers to find information, fill out forms, and click buttons is natural for humans but extremely challenging for machines. Web pages are dynamic with diverse layouts, and the same task must be performed differently across different websites.
By applying reinforcement learning to web navigation, agents can learn appropriate actions (clicking, typing, etc.) by observing the visual information or DOM structure of web pages.
Challenges of Web Navigation
Why Is It Difficult?
Web environments differ significantly from traditional RL environments:
- Enormous state space: The rendered result (pixels) of a web page has millions of dimensions
- Enormous action space: Combinations of mouse position (x, y) + click/drag + keyboard input
- Delayed reward: Task completion can only be confirmed after multiple clicks and inputs
- Partial observability: Elements visible only after scrolling, popups, dynamic loading, etc.
- Environment non-determinism: The same page can show different states depending on loading time
State Representation Methods
| Method | Description | Advantages | Disadvantages |
|---|---|---|---|
| Pixel-based | Use screenshots directly as input | General purpose, includes visual info | High-dimensional, slow learning |
| DOM-based | Parse HTML DOM tree for use | Rich structural information | Different structure per site |
| Hybrid | Combine pixel + DOM information | Richest information | Complex preprocessing needed |
Browser Automation and Reinforcement Learning
Environment Interface
We wrap the web browser as a Gym-compatible environment:
import numpy as np
class WebEnvironment:
    """Gym-compatible web browser environment.

    Observations are preprocessed screenshots; the discrete action space is
    the union of grid-cell clicks (grid_size * grid_size actions) followed
    by single-character keyboard inputs (128 ASCII actions).
    """

    def __init__(self, task_config, screen_width=160, screen_height=210):
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.task = task_config
        # Action space: grid clicks + keyboard input.
        self.grid_size = 16  # 16x16 grid
        self.num_click_actions = self.grid_size * self.grid_size
        self.num_type_actions = 128  # ASCII characters
        self.total_actions = self.num_click_actions + self.num_type_actions

    def reset(self):
        """Start a new episode: load the task's page, return first observation."""
        self._load_page(self.task['url'])
        screenshot = self._get_screenshot()
        return self._preprocess(screenshot)

    def step(self, action):
        """Execute one action and return (obs, reward, done, info).

        Actions in [0, num_click_actions) are grid-cell clicks; the
        remaining indices are single ASCII characters to type.
        """
        if action < self.num_click_actions:
            # Click action: convert the flat index to a grid cell and click
            # the cell *center* (consistent with GridActionSpace), instead
            # of the top-left corner where the click can miss the element.
            row = action // self.grid_size
            col = action % self.grid_size
            x = int((col + 0.5) * (self.screen_width / self.grid_size))
            y = int((row + 0.5) * (self.screen_height / self.grid_size))
            self._click(x, y)
        else:
            # Typing action: remaining indices map directly to ASCII codes.
            char_idx = action - self.num_click_actions
            self._type_char(chr(char_idx))
        screenshot = self._get_screenshot()
        obs = self._preprocess(screenshot)
        reward = self._compute_reward()
        done = self._check_done()
        return obs, reward, done, {}

    def _preprocess(self, screenshot):
        """Convert a screenshot (PIL-style image) to a float32 array in [0, 1]."""
        # NOTE: resize takes (width, height); the resulting array is HxWxC.
        resized = np.array(screenshot.resize((self.screen_width,
                                              self.screen_height)))
        return resized.astype(np.float32) / 255.0
Mini World of Bits Benchmark
Mini World of Bits (MiniWoB), developed by OpenAI and researchers, is the standard benchmark for web navigation RL. The goal is to perform specific tasks on simple HTML widgets.
Task Examples
- click-button: Click a button with specific text
- click-checkboxes: Select designated checkboxes
- enter-text: Enter a specified string in a text field
- navigate-tree: Navigate a tree structure menu to select a specific item
- email-inbox: Find and perform actions on emails matching specific conditions
Each task is a small 160x210 pixel web page that the agent must complete within 10 seconds.
class MiniWoBTask:
    """Definition of a single MiniWoB task.

    Rewards lie in [-1, 1]: finishing the task earns a time-scaled bonus,
    exceeding the time limit is penalized, and anything else is neutral.
    """

    def __init__(self, task_name):
        self.task_name = task_name
        self.time_limit = 10.0  # seconds
        self.reward_range = (-1.0, 1.0)

    def get_reward(self, page_state, time_elapsed):
        """Return the reward for the current page state and elapsed time."""
        if self._is_task_complete(page_state):
            # Finishing faster yields a larger bonus, floored at 0.1.
            remaining_fraction = 1.0 - time_elapsed / self.time_limit
            return max(remaining_fraction, 0.1)
        if time_elapsed >= self.time_limit:
            # Ran out of time without completing the task.
            return -1.0
        # Episode still in progress.
        return 0.0
OpenAI Universe
OpenAI Universe was a platform that provided various environments (games, web browsers, etc.) through a standard interface. It observed browser screens via VNC (Virtual Network Computing) and sent mouse/keyboard events.
Features of VNC-Based Environments
- Runs actual browsers (Chrome, Firefox) in Docker containers
- The agent receives screen pixels via VNC protocol
- Actions are sent as mouse coordinates and keyboard events
- Network latency and rendering delays exist
class VNCActionSpace:
    """Action space expressed as VNC input events."""

    def __init__(self, screen_width, screen_height):
        self.screen_width = screen_width
        self.screen_height = screen_height

    def click(self, x, y):
        """Build a left-button pointer event at pixel (x, y)."""
        event = {'type': 'pointer'}
        event['x'] = int(x)
        event['y'] = int(y)
        event['button'] = 1  # left button
        return event

    def type_text(self, text):
        """Build one key-press event per character of `text`."""
        return [
            {'type': 'key', 'key': char, 'action': 'press'}
            for char in text
        ]

    def scroll(self, x, y, direction='down'):
        """Build a scroll event; downward scrolls use a negative delta."""
        return {
            'type': 'scroll',
            'x': int(x),
            'y': int(y),
            'delta': -3 if direction == 'down' else 3,
        }
Simple Click Approach
The most basic web navigation agent transforms the problem into a classification task by dividing the screen into a grid and deciding which cell to click.
Grid Action Space
class GridActionSpace:
    """Map between flat click actions and pixel coordinates on an NxN grid."""

    def __init__(self, screen_w, screen_h, grid_n=16):
        self.screen_w = screen_w
        self.screen_h = screen_h
        self.grid_n = grid_n
        self.cell_w = screen_w / grid_n
        self.cell_h = screen_h / grid_n
        self.n_actions = grid_n * grid_n

    def action_to_coordinate(self, action_idx):
        """Return the (x, y) pixel at the center of the cell for `action_idx`."""
        row, col = divmod(action_idx, self.grid_n)
        # Offset by half a cell to land on the cell center.
        center_x = (col + 0.5) * self.cell_w
        center_y = (row + 0.5) * self.cell_h
        return int(center_x), int(center_y)

    def coordinate_to_action(self, x, y):
        """Return the flat action index of the cell containing pixel (x, y)."""
        # Clamp to the last cell so edge pixels stay in range.
        col = min(int(x / self.cell_w), self.grid_n - 1)
        row = min(int(y / self.cell_h), self.grid_n - 1)
        return row * self.grid_n + col
CNN-Based Model
A model that takes screenshots as input and outputs click probabilities for grid cells:
import torch
import torch.nn as nn
class WebNavigationModel(nn.Module):
    """Actor-critic CNN for grid-click web navigation.

    The convolutional trunk encodes a 3x210x160 screenshot; the policy head
    scores each of the grid_size * grid_size click cells and the value head
    estimates the scalar state value.
    """

    def __init__(self, grid_size=16):
        super().__init__()
        self.grid_size = grid_size
        num_cells = grid_size * grid_size
        # Shared convolutional trunk for screen feature extraction.
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.ReLU(),
        )
        # Probe the trunk once with a zero tensor to size the linear heads.
        self._feature_size = self._get_conv_output_size((3, 210, 160))
        # Actor head: logits over click cells.
        self.policy = nn.Sequential(
            nn.Linear(self._feature_size, 512),
            nn.ReLU(),
            nn.Linear(512, num_cells),
        )
        # Critic head: scalar state value.
        self.value = nn.Sequential(
            nn.Linear(self._feature_size, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def _get_conv_output_size(self, shape):
        """Run a dummy input of `shape` through the trunk to get the flat size."""
        with torch.no_grad():
            probe = torch.zeros(1, *shape)
            return self.conv(probe).view(1, -1).shape[1]

    def forward(self, screen):
        """Return (policy_logits, value) for a batch of screens (Bx3x210x160)."""
        flat = self.conv(screen).view(screen.size(0), -1)
        return self.policy(flat), self.value(flat)
Training Loop
def train_web_agent(model, env, num_episodes=10000, gamma=0.99):
    """Train a web navigation agent with A2C (advantage actor-critic).

    Args:
        model: actor-critic network returning (policy_logits, value) for a
            batch of screens (e.g. WebNavigationModel).
        env: Gym-style environment; reset() returns an HxWxC float screen,
            step(action) returns (next_state, reward, done, info).
        num_episodes: number of training episodes.
        gamma: discount factor used for the returns.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        log_probs = []
        values = []
        rewards = []
        while not done:
            # HxWxC float screen -> 1xCxHxW batch tensor for the CNN.
            state_t = torch.FloatTensor(state).permute(2, 0, 1).unsqueeze(0)
            logits, value = model(state_t)
            probs = torch.softmax(logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            next_state, reward, done, info = env.step(action.item())
            log_probs.append(dist.log_prob(action))
            values.append(value.squeeze())
            rewards.append(reward)
            state = next_state
            episode_reward += reward
        # A2C update once per finished episode.
        returns = compute_returns(rewards, gamma)
        returns_t = torch.FloatTensor(returns)
        values_t = torch.stack(values)
        log_probs_t = torch.stack(log_probs)
        # detach() keeps the advantage out of the critic's gradient path.
        advantages = returns_t - values_t.detach()
        policy_loss = -(log_probs_t * advantages).mean()
        value_loss = (returns_t - values_t).pow(2).mean()
        loss = policy_loss + 0.5 * value_loss
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping stabilizes training on noisy episode returns.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40.0)
        optimizer.step()
        if episode % 100 == 0:
            print(f"Episode {episode}: Reward={episode_reward:.2f}")
def compute_returns(rewards, gamma):
    """Compute discounted returns G_t = r_t + gamma * G_{t+1} for one episode.

    Args:
        rewards: per-step rewards in chronological order (may be empty).
        gamma: discount factor.

    Returns:
        A list of discounted returns aligned with `rewards`.
    """
    returns = []
    running = 0
    for reward in reversed(rewards):
        running = reward + gamma * running
        # append + final reverse is O(n); insert(0, ...) per step was O(n^2).
        returns.append(running)
    returns.reverse()
    return returns
Learning from Human Demonstrations
Pure RL alone can be very slow for web navigation learning. Human demonstrations can significantly accelerate learning.
Behavioral Cloning
First perform supervised learning to imitate human behavior, then fine-tune with RL:
def pretrain_with_demonstrations(model, demos, num_epochs=50):
    """Behavioral cloning: supervised pretraining on human demonstrations.

    Each demo is a (screen, action) pair; the model's policy logits are
    trained per-sample with cross-entropy so the model reproduces the
    demonstrated action. Returns the model (trained in place).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    xent = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        loss_sum = 0
        num_correct = 0
        num_seen = 0
        for screen, demo_action in demos:
            # HxWxC screen -> 1xCxHxW batch; action -> one-element label.
            batch = torch.FloatTensor(screen).permute(2, 0, 1).unsqueeze(0)
            target = torch.tensor([demo_action], dtype=torch.long)
            logits, _ = model(batch)
            sample_loss = xent(logits, target)
            optimizer.zero_grad()
            sample_loss.backward()
            optimizer.step()
            loss_sum += sample_loss.item()
            num_correct += (logits.argmax(dim=-1) == target).sum().item()
            num_seen += 1
        accuracy = num_correct / num_seen
        print(f"Epoch {epoch}: Loss={loss_sum/len(demos):.4f}, "
              f"Accuracy={accuracy:.2%}")
    return model
Demonstration-Weighted Learning
Human demonstration experiences are placed in a replay buffer with high priority, continuously referenced during RL training:
class DemonstrationReplayBuffer:
    """Replay buffer mixing human demonstrations with agent experience.

    Demonstrations are kept forever; agent experience is a bounded FIFO of
    at most `capacity` transitions. sample() draws a fixed fraction
    (`demo_ratio`) of each batch from the demonstration buffer.
    """

    def __init__(self, capacity, demo_ratio=0.25):
        self.capacity = capacity      # max agent transitions retained
        self.demo_ratio = demo_ratio  # fraction of each batch from demos
        self.demo_buffer = []
        self.agent_buffer = []

    def add_demonstration(self, transition):
        """Store a human-demonstration transition (never evicted)."""
        self.demo_buffer.append(transition)

    def add_agent_experience(self, transition):
        """Store an agent transition, evicting the oldest when full."""
        if len(self.agent_buffer) >= self.capacity:
            self.agent_buffer.pop(0)
        self.agent_buffer.append(transition)

    def sample(self, batch_size):
        """Sample a batch mixing demonstrations and agent experience.

        At most batch_size * demo_ratio transitions come from demos; if
        either buffer is short, the returned batch simply shrinks.
        """
        # Fix: `random` was used here without ever being imported, which
        # raised NameError at runtime. Function-scope import keeps the
        # snippet self-contained.
        import random
        n_demo = int(batch_size * self.demo_ratio)
        n_agent = batch_size - n_demo
        demo_samples = random.sample(
            self.demo_buffer,
            min(n_demo, len(self.demo_buffer))
        )
        agent_samples = random.sample(
            self.agent_buffer,
            min(n_agent, len(self.agent_buffer))
        )
        return demo_samples + agent_samples
Practical Considerations and Limitations
Current Technical Limitations
- Even on the MiniWoB benchmark, complex tasks (email-inbox, social-media, etc.) do not reach human-level performance
- It is difficult to handle the diversity and complexity of real websites
- Vulnerable to dynamic changes in DOM structure
Recent Research Directions
- LLM-based web agents: Input HTML as text to generate actions
- Multimodal models: Agents that understand both screen images and text
- Hierarchical RL: Separating high-level planning (which elements to interact with) from low-level execution (precise coordinate clicking)
Key Takeaways
- Web navigation is a challenging RL problem characterized by enormous state/action spaces and delayed rewards
- A basic agent can be implemented with CNN + A2C by simplifying the action space to a grid-based approach
- Pretraining with human demonstrations (behavioral cloning) significantly accelerates RL learning
- Recent research is moving toward general-purpose web agents using LLMs and multimodal models
In the next post, we will explore continuous action spaces with DDPG and distributional policy gradients.