Tool Calling 실전 가이드: AI가 외부 세계를 다루는 방법과 흔한 함정들

Tool Calling이 Agent의 핵심인 이유
OpenAI Function Calling 완전 구현
- 기본 구조
- 완전한 Tool Call 루프
Anthropic Claude의 Tool Use
흔한 실수와 해결법 (실전 경험)
Parallel Tool Calls (병렬 실행)
Tool 설계 원칙
프로덕션 체크리스트
마치며

LLM만으로는 텍스트를 변환할 뿐입니다. 툴이 붙으면 무언가를 할 수 있게 됩니다. Tool Calling은 LLM을 수동적인 텍스트 생성기에서 능동적인 에이전트로 바꾸는 메커니즘입니다.

이 글은 Tool Calling이 실제로 어떻게 동작하는지, 완성도 높은 구현 방법, 그리고 현업에서 반드시 부딪히는 함정들을 다룹니다.

Tool Calling이 Agent의 핵심인 이유

에이전트의 능력은 사용할 수 있는 툴의 범위와 같습니다:

검색 툴 → 실시간 정보에 접근
계산기 툴 → 수학적 정확도 보장
코드 실행 툴 → 실제 코드 작성 및 실행
API 호출 툴 → 외부 서비스와 통합
데이터베이스 툴 → 데이터 읽기/쓰기

툴이 없으면 LLM은 학습 데이터 안에서만 대답합니다. 툴이 있으면 실시간 주가를 조회하고, 이메일을 보내고, 코드를 실행할 수 있습니다.

OpenAI Function Calling 완전 구현

현재 사실상 표준이 된 OpenAI의 function calling 방식입니다.

기본 구조

import asyncio
import functools
import inspect
import json
import time
from typing import Any

import openai

# NOTE(review): "your-api-key" is a placeholder literal; confirm how the real
# key is supplied before running this, rather than hard-coding it here.
client = openai.OpenAI(api_key="your-api-key")

# Step 1: tool definitions
# Each entry describes one function to the model: its name, a description of
# when (and when not) to use it, and a JSON-Schema-style "parameters" object.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": (
                "현재 날씨를 조회합니다. 사용자가 특정 도시의 날씨를 물어볼 때 사용하세요. "
                "과거 날씨나 예보에는 사용하지 마세요."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "도시명 (영어). 예: 'Seoul', 'Tokyo', 'New York'"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "온도 단위. 기본값: celsius"
                    }
                },
                # Only "city" is mandatory; "unit" may be omitted by the model.
                "required": ["city"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": (
                "웹에서 정보를 검색합니다. 최신 정보, 뉴스, "
                "학습 데이터에 없을 수 있는 정보를 찾을 때 사용하세요."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "검색 쿼리"
                    },
                    "num_results": {
                        "type": "integer",
                        "description": "반환할 결과 수 (1-10)",
                        "default": 3
                    }
                },
                # Only "query" is mandatory; "num_results" may be omitted.
                "required": ["query"]
            }
        }
    }
]

# Step 2: concrete tool implementations
def get_weather(city: str, unit: str = "celsius") -> dict:
    """Fetch the weather for ``city`` and return it as a plain dict.

    Args:
        city: City name passed through to the weather API.
        unit: Temperature unit passed through to the weather API.

    Returns:
        A dict with the keys ``city``, ``temperature``, ``unit``,
        ``condition`` and ``humidity``.
    """
    # Delegates to the external weather API client.
    report = weather_api.get(city=city, unit=unit)

    weather = dict(city=city, temperature=report.temp, unit=unit)
    weather["condition"] = report.condition
    weather["humidity"] = report.humidity
    return weather

def search_web(query: str, num_results: int = 3) -> list:
    """Run a web search and return the hits as a list of plain dicts.

    Args:
        query: Search query passed through to the search API.
        num_results: Passed to the search API as ``count``.

    Returns:
        One dict per hit with the keys ``title``, ``snippet`` and ``url``.
    """
    hits = []
    # Delegates to the external search API client.
    for item in search_api.search(query, count=num_results):
        hits.append({"title": item.title, "snippet": item.snippet, "url": item.url})
    return hits

# Maps each tool name to the function object that implements it; the agent
# loops look tools up here by the name the model asked for.
available_tools = {
    "get_weather": get_weather,
    "search_web": search_web
}

완전한 Tool Call 루프

def run_agent(user_message: str, max_iterations: int = 10) -> str:
    """Run the chat-completions tool-calling loop until the model answers.

    Each round sends the conversation to the model. If the reply contains no
    tool calls, its text is returned. Otherwise every requested tool is run
    and its output is appended as a ``role="tool"`` message before the next
    round.

    Args:
        user_message: The user request that starts the conversation.
        max_iterations: Upper bound on model round-trips.

    Returns:
        The model's final text (an empty string if the final message has no
        text content), or ``"Max iterations reached"`` when the iteration
        budget is used up.
    """
    messages = [{"role": "user", "content": user_message}]

    for _ in range(max_iterations):
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            tool_choice="auto",  # the model decides whether to call a tool
        )

        assistant_message = response.choices[0].message
        messages.append(assistant_message)

        # No tool calls means this is the final answer.
        if not assistant_message.tool_calls:
            return assistant_message.content or ""

        for tool_call in assistant_message.tool_calls:
            function_name = tool_call.function.name
            raw_arguments = tool_call.function.arguments

            try:
                # The arguments are model-generated text, so they may be
                # empty or not valid JSON; that must not crash the loop.
                function_args = json.loads(raw_arguments) if raw_arguments else {}
            except json.JSONDecodeError as e:
                tool_result = (
                    f"Error: arguments for {function_name} are not valid JSON "
                    f"({e}). Please call the tool again with valid JSON."
                )
            else:
                print(f"Calling tool: {function_name}({function_args})")

                if function_name in available_tools:
                    try:
                        result = available_tools[function_name](**function_args)
                        tool_result = json.dumps(result, ensure_ascii=False)
                    except Exception as e:
                        # Report the failure to the model instead of raising
                        # so it can try a different approach.
                        tool_result = f"Error: {str(e)}. Please try a different approach."
                else:
                    tool_result = f"Unknown tool: {function_name}"

            # Every tool call gets a reply, including failed ones, so the
            # model sees the outcome on the next round.
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": tool_result
            })

    return "Max iterations reached"

# Usage example: a single request that needs both the weather tool and the
# web-search tool.
result = run_agent("서울 날씨 어때? 그리고 오늘 AI 뉴스도 찾아줘")
print(result)

Anthropic Claude의 Tool Use

Claude는 OpenAI와 다른 형식이지만 개념은 동일합니다:

import anthropic
import json

# NOTE: this rebinds the module-level names `client` and `tools` that the
# OpenAI example earlier in this file also assigns.
# NOTE(review): "your-api-key" is a placeholder literal; confirm how the real
# key is supplied.
client = anthropic.Anthropic(api_key="your-api-key")

# Tool definitions in the shape this example passes to client.messages.create:
# a flat dict with "name", "description" and an "input_schema" object.
tools = [
    {
        "name": "get_weather",
        "description": "현재 날씨를 조회합니다",
        "input_schema": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "도시명"}
            },
            "required": ["city"]
        }
    }
]

def run_claude_agent(user_message: str, max_iterations: int = 10) -> str:
    """Run the Anthropic tool-use loop until the model stops asking for tools.

    Each round sends the conversation to the model. While the response stops
    with ``"tool_use"``, every ``tool_use`` block is executed and the results
    are sent back as one user message of ``tool_result`` blocks.

    Args:
        user_message: The user request that starts the conversation.
        max_iterations: Upper bound on model round-trips, so a model that
            keeps requesting tools cannot loop forever.

    Returns:
        The text blocks of the final response joined with spaces, or
        ``"Max iterations reached"`` when the iteration budget is used up.
    """
    messages = [{"role": "user", "content": user_message}]

    for _ in range(max_iterations):
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            tools=tools,
            messages=messages
        )

        # Keep the assistant turn in the history.
        messages.append({"role": "assistant", "content": response.content})

        # Only "tool_use" means the model is waiting for tool results. For
        # any other stop reason, return the text instead of sending back an
        # empty tool-result message and looping again.
        if response.stop_reason != "tool_use":
            return " ".join(
                block.text for block in response.content
                if hasattr(block, "text")
            )

        tool_results = []
        for block in response.content:
            if block.type != "tool_use":
                continue

            try:
                if block.name not in available_tools:
                    # Explicit message instead of a bare KeyError repr.
                    raise LookupError(f"Unknown tool: {block.name}")
                result = available_tools[block.name](**block.input)
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    # ensure_ascii=False keeps non-ASCII text readable.
                    "content": json.dumps(result, ensure_ascii=False)
                })
            except Exception as e:
                # Report the failure to the model rather than raising.
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": f"Error: {str(e)}",
                    "is_error": True
                })

        messages.append({"role": "user", "content": tool_results})

    return "Max iterations reached"

흔한 실수와 해결법 (실전 경험)

이 섹션이 이 글에서 가장 중요합니다. 실제로 프로덕션에서 마주치는 문제들입니다.

실수 1: Tool Description이 나쁘면 LLM이 잘못 부름

# Bad: too vague — neither the name nor the description says what is queried
{
    "name": "query",
    "description": "Query data"
}

# Good: states what it does, what it needs, what it returns, and which tool
# to use instead for a neighbouring task
{
    "name": "query_customer_orders",
    "description": (
        "특정 고객의 주문 내역을 조회합니다. "
        "customer_id가 필요합니다. "
        "반환: 주문 ID, 날짜, 금액, 상태 목록. "
        "고객 정보 조회에는 get_customer_info를 사용하세요."
    )
}

실수 2: 에러 핸들링 없음

# Bad: if this raises, the whole agent dies
result = database.query(sql)

# Good: return the error to the LLM as text so it can try another approach
# (fragment: the `return` statements belong to an enclosing tool function
# that is not shown here)
try:
    result = database.query(sql)
    return json.dumps(result)
except DatabaseError as e:
    return f"Database error: {str(e)}. The query may have a syntax error."
except TimeoutError:
    return "Query timed out. Try a simpler query or add more specific filters."
except Exception as e:
    # Catch-all so no exception escapes the tool.
    return f"Unexpected error: {str(e)}. Please try a different approach."

실수 3: 무한 루프

# Always add loop detection
# (fragment: `tool_calls` and the enclosing function are not shown here)
MAX_ITERATIONS = 15
call_count = {}  # "name:arguments" key -> number of times seen so far

for iteration in range(MAX_ITERATIONS):
    # ...
    for tool_call in tool_calls:
        name = tool_call.function.name
        args_str = tool_call.function.arguments

        # Identical tool name + identical raw argument string share one key.
        call_key = f"{name}:{args_str}"
        call_count[call_key] = call_count.get(call_key, 0) + 1

        # Give up on the fourth identical call.
        if call_count[call_key] > 3:
            return "Error: Tool called too many times with same args. Breaking loop."

실수 4: 너무 많은 툴 정의

LLM의 성능은 툴이 많아질수록 저하됩니다. 툴이 10개 이상이면 모든 툴을 넘기지 말고, 요청과 관련된 툴만 동적으로 선택해 전달하는 방식을 고려하세요. 아래 예시는 단순한 키워드 매칭으로 점수를 매겨 상위 8개(max_tools 기본값)만 남깁니다.

def select_relevant_tools(user_query: str, all_tools: list, max_tools: int = 8) -> list:
    """Keep only the tools whose descriptions best match the user's query.

    Relevance is the number of lower-cased, whitespace-separated words that
    the query and a tool's ``tool["function"]["description"]`` share.

    Args:
        user_query: The user's request text.
        all_tools: Tool definitions in the ``{"function": {...}}`` shape.
        max_tools: Maximum number of tools to return.

    Returns:
        ``all_tools`` itself when it already fits within ``max_tools``;
        otherwise a new list of the ``max_tools`` highest-scoring tools,
        with ties kept in their original order.
    """
    if len(all_tools) <= max_tools:
        return all_tools

    query_terms = set(user_query.lower().split())

    def shared_terms(tool):
        # Simple keyword overlap between the query and the description.
        description_terms = set(tool["function"]["description"].lower().split())
        return len(query_terms.intersection(description_terms))

    # sorted() is stable, so equally-scored tools keep their input order.
    ranked = sorted(all_tools, key=shared_terms, reverse=True)
    return ranked[:max_tools]

실수 5: 민감한 작업에 확인 없음

삭제, 결제, 이메일 발송 같은 비가역적 작업에는 반드시 Human-in-the-loop를 추가하세요.

# Names of tools that must be approved before they run
HIGH_RISK_TOOLS = {"delete_record", "send_email", "process_payment", "deploy_code"}

def execute_tool_with_approval(tool_name: str, args: dict) -> str:
    """Run a tool, asking for console approval first if it is high-risk.

    Args:
        tool_name: Key of the tool in ``available_tools``.
        args: Keyword arguments forwarded to the tool.

    Returns:
        Whatever the tool returns, or ``"Action cancelled by user"`` when a
        high-risk action is not approved with exactly "yes" (any case).

    Raises:
        KeyError: If ``tool_name`` is not in ``available_tools``.
    """
    if tool_name not in HIGH_RISK_TOOLS:
        return available_tools[tool_name](**args)

    # A real app would ask via a UI confirmation dialog, a Slack message, etc.
    print(f"HIGH RISK ACTION: {tool_name}({args})")
    answer = input("Approve this action? (yes/no): ")
    if answer.lower() == "yes":
        return available_tools[tool_name](**args)
    return "Action cancelled by user"

Parallel Tool Calls (병렬 실행)

최신 LLM은 여러 툴을 동시에 호출할 수 있습니다. 이를 활용하면 속도가 크게 향상됩니다.

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def execute_tool_calls_parallel(tool_calls: list) -> list:
    """Run every requested tool call concurrently and collect the results.

    Each tool runs in a worker thread so blocking calls overlap instead of
    running one after another. A failure in one call (bad JSON arguments,
    unknown tool, exception inside the tool) is reported in that call's own
    result and does not affect the others.

    Args:
        tool_calls: Tool-call objects exposing ``id``, ``function.name`` and
            ``function.arguments`` (a JSON string).

    Returns:
        One ``{"tool_call_id", "content"}`` dict per call, in input order.
        ``content`` is the JSON-encoded tool result or an ``"Error: ..."``
        string.
    """

    async def execute_single(tool_call):
        try:
            name = tool_call.function.name
            raw_arguments = tool_call.function.arguments
            # Parsed inside the try: the arguments are model-generated and
            # may be empty or malformed.
            args = json.loads(raw_arguments) if raw_arguments else {}
            tool = available_tools[name]
            # Threads help with blocking I/O, not CPU-bound work. to_thread
            # reuses the loop's default executor instead of a pool per call.
            result = await asyncio.to_thread(tool, **args)
            content = json.dumps(result, ensure_ascii=False)
        except Exception as e:
            content = f"Error: {str(e)}"
        return {"tool_call_id": tool_call.id, "content": content}

    # gather() preserves input order in its result list.
    return list(await asyncio.gather(*(execute_single(tc) for tc in tool_calls)))

# 사용 예시
# "서울, 도쿄, 뉴욕 날씨를 동시에 알려줘"
# LLM이 3개의 get_weather 호출을 동시에 반환
# → 순차 실행 시 3× 시간 vs 병렬 실행 시 1× 시간

Tool 설계 원칙

잘 설계된 툴과 그렇지 않은 툴의 차이는 에이전트 성능에 직결됩니다.

원칙 1: Single Responsibility

하나의 툴은 하나의 일만 합니다.

# Bad: one tool with too many roles, selected by an `action` string
def manage_customer(action: str, customer_id: int, **kwargs):
    if action == "get": ...
    elif action == "update": ...
    elif action == "delete": ...

# Good: one separate tool per operation
def get_customer(customer_id: int) -> dict: ...
def update_customer(customer_id: int, name: str = None, email: str = None) -> dict: ...
def delete_customer(customer_id: int) -> dict: ...

원칙 2: 읽기 vs 쓰기 분리

읽기 툴은 부작용이 없습니다. 쓰기 툴은 상태를 변경합니다. 명확히 분리하세요.

# Read tools: safe, fine to call repeatedly
def get_user_balance(user_id: int) -> float: ...
def search_products(query: str) -> list: ...

# Write tools: irreversible, handle with care
def transfer_money(from_id: int, to_id: int, amount: float) -> dict: ...
def delete_order(order_id: int) -> dict: ...

원칙 3: 구조화된 데이터 반환

자연어 문장이 아닌 JSON을 반환하세요. LLM이 파싱하고 추론하기 훨씬 쉽습니다.

# Bad: free-form text the LLM has to parse
def get_user_info(user_id: int) -> str:
    return f"John Doe, 30 years old, email: john@example.com, joined 2022"

# Good: structured, JSON-serializable data
# (same function name as above; the two definitions are alternatives)
def get_user_info(user_id: int) -> dict:
    return {
        "id": user_id,
        "name": "John Doe",
        "age": 30,
        "email": "john@example.com",
        "joined_at": "2022-01-15"
    }

원칙 4: 에러 정보를 결과에 포함

def safe_tool_wrapper(func):
    """Decorate a tool so it returns a result envelope instead of raising.

    Args:
        func: The tool function to wrap.

    Returns:
        A wrapper with the same name and docstring as ``func`` that returns
        ``{"success": True, "data": <result>}`` on success, or
        ``{"success": False, "error": <code>, "message": <text>}`` where the
        code is ``"invalid_input"`` for ``ValueError``,
        ``"permission_denied"`` for ``PermissionError`` and ``"unexpected"``
        for any other ``Exception``.
    """
    # wraps() keeps __name__/__doc__ so the decorated tool can still be
    # identified by name and described from its docstring.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
            return {"success": True, "data": result}
        except ValueError as e:
            return {"success": False, "error": "invalid_input", "message": str(e)}
        except PermissionError as e:
            return {"success": False, "error": "permission_denied", "message": str(e)}
        except Exception as e:
            return {"success": False, "error": "unexpected", "message": str(e)}
    return wrapper

# Example of applying the wrapper: any exception raised in the body comes
# back as an error dict instead of propagating.
@safe_tool_wrapper
def create_order(product_id: int, quantity: int, user_id: int) -> dict:
    # order-creation logic goes here
    ...

프로덕션 체크리스트

Tool Calling을 프로덕션에 배포할 때 확인해야 할 항목들:

class ToolCallMiddleware:
    """Production-oriented wrapper around tool execution.

    Looks a tool up by name, validates its arguments, runs it in a worker
    thread under a timeout, and records every attempt in ``call_log``.
    """

    def __init__(self, tools: dict, max_iterations: int = 10):
        """Store the tool registry and start with an empty call log.

        Args:
            tools: Mapping of tool name to callable.
            max_iterations: Stored on the instance; not read by this class.
        """
        self.tools = tools
        self.max_iterations = max_iterations
        self.call_log = []  # one entry per execute() call, failures included

    async def execute(self, tool_name: str, args: dict, timeout: int = 30) -> dict:
        """Run one tool call and log the outcome.

        Args:
            tool_name: Key of the tool in ``self.tools``.
            args: Keyword arguments for the tool.
            timeout: Seconds to wait for the tool before giving up.

        Returns:
            The tool's own return value on success, or ``{"error": <text>}``
            for an unknown tool, invalid arguments, a timeout, or an
            exception raised by the tool.
        """
        # monotonic() is unaffected by wall-clock adjustments.
        started = time.monotonic()
        result = await self._run(tool_name, args, timeout)

        self.call_log.append({
            "tool": tool_name,
            "args": args,
            "elapsed": time.monotonic() - started,
            # Tools may return non-dict values (None, numbers, lists), so
            # check the type before looking for the "error" key.
            "success": not (isinstance(result, dict) and "error" in result),
            "timestamp": time.time()
        })

        return result

    async def _run(self, tool_name: str, args: dict, timeout: int):
        """Look up, validate and invoke one tool; failures become error dicts."""
        # 1. The tool must exist.
        if tool_name not in self.tools:
            return {"error": f"Unknown tool: {tool_name}"}

        # 2. Validate the input before running anything.
        try:
            validated_args = self._validate_args(tool_name, args)
        except ValueError as e:
            return {"error": f"Invalid arguments: {str(e)}"}

        # 3. Run under a timeout. On timeout the caller stops waiting, but
        #    the worker thread itself cannot be interrupted.
        try:
            return await asyncio.wait_for(
                asyncio.to_thread(self.tools[tool_name], **validated_args),
                timeout=timeout
            )
        except asyncio.TimeoutError:
            return {"error": f"Timed out after {timeout}s"}
        except Exception as e:
            return {"error": str(e)}

    def _validate_args(self, tool_name: str, args: dict) -> dict:
        """Check that ``args`` can be passed to the tool as keyword arguments.

        Subclasses can override this for stricter, schema-based validation.

        Returns:
            A shallow copy of ``args``.

        Raises:
            ValueError: If ``args`` is not a dict or does not match the
                tool's signature (missing or unexpected parameters).
        """
        if not isinstance(args, dict):
            raise ValueError(
                f"arguments must be an object, got {type(args).__name__}"
            )

        try:
            signature = inspect.signature(self.tools[tool_name])
        except (TypeError, ValueError):
            # Some callables expose no signature; nothing to check then.
            return dict(args)

        try:
            signature.bind(**args)
        except TypeError as e:
            raise ValueError(str(e)) from e
        return dict(args)

마치며

Tool Calling은 LLM을 에이전트로 만드는 핵심 기술입니다. 기본 구현은 어렵지 않지만, 잘 동작하는 Tool Calling을 만드는 것은 디테일의 싸움입니다.

가장 중요한 것 세 가지: 명확한 툴 설명, 에러 처리, 루프 방지. 이 세 가지만 잘 해도 대부분의 문제를 피할 수 있습니다.

이 시리즈를 통해 에이전트 설계 패턴(Post 5), MCP(Post 6), Multi-Agent 시스템(Post 7), 그리고 Tool Calling(Post 8)을 다뤘습니다. 이제 실제로 무언가를 만들어 보세요 — 이론보다 구현에서 배우는 게 훨씬 빠릅니다.