Skip to content

Split View: Tool Calling 실전 가이드: AI가 외부 세계를 다루는 방법과 흔한 함정들

|

Tool Calling 실전 가이드: AI가 외부 세계를 다루는 방법과 흔한 함정들

LLM만으로는 텍스트를 변환할 뿐입니다. 툴이 붙으면 무언가를 할 수 있게 됩니다. Tool Calling은 LLM을 수동적인 텍스트 생성기에서 능동적인 에이전트로 바꾸는 메커니즘입니다.

이 글은 Tool Calling이 실제로 어떻게 동작하는지, 완성도 높은 구현 방법, 그리고 현업에서 반드시 부딪히는 함정들을 다룹니다.

Tool Calling이 Agent의 핵심인 이유

에이전트의 능력은 사용할 수 있는 툴의 범위와 같습니다:

  • 검색 툴 → 실시간 정보에 접근
  • 계산기 툴 → 수학적 정확도 보장
  • 코드 실행 툴 → 실제 코드 작성 및 실행
  • API 호출 툴 → 외부 서비스와 통합
  • 데이터베이스 툴 → 데이터 읽기/쓰기

툴이 없으면 LLM은 학습 데이터 안에서만 대답합니다. 툴이 있으면 실시간 주가를 조회하고, 이메일을 보내고, 코드를 실행할 수 있습니다.

OpenAI Function Calling 완전 구현

현재 사실상 표준이 된 OpenAI의 function calling 방식입니다.

기본 구조

import openai
import json
from typing import Any

client = openai.OpenAI(api_key="your-api-key")

# 1단계: 툴 정의
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": (
                "현재 날씨를 조회합니다. 사용자가 특정 도시의 날씨를 물어볼 때 사용하세요. "
                "과거 날씨나 예보에는 사용하지 마세요."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "도시명 (영어). 예: 'Seoul', 'Tokyo', 'New York'"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "온도 단위. 기본값: celsius"
                    }
                },
                "required": ["city"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": (
                "웹에서 정보를 검색합니다. 최신 정보, 뉴스, "
                "학습 데이터에 없을 수 있는 정보를 찾을 때 사용하세요."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "검색 쿼리"
                    },
                    "num_results": {
                        "type": "integer",
                        "description": "반환할 결과 수 (1-10)",
                        "default": 3
                    }
                },
                "required": ["query"]
            }
        }
    }
]

# 2단계: 실제 함수 구현
def get_weather(city: str, unit: str = "celsius") -> dict:
    """Fetch current weather for *city* and return a structured report.

    Delegates to the external weather backend and repackages the
    response as a plain dict so the LLM receives JSON-able data.
    """
    api_response = weather_api.get(city=city, unit=unit)
    report = {"city": city, "temperature": api_response.temp, "unit": unit}
    report["condition"] = api_response.condition
    report["humidity"] = api_response.humidity
    return report

def search_web(query: str, num_results: int = 3) -> list:
    """Search the web; return a list of {title, snippet, url} dicts."""
    hits = search_api.search(query, count=num_results)
    shaped = []
    for hit in hits:
        shaped.append({"title": hit.title, "snippet": hit.snippet, "url": hit.url})
    return shaped

# Dispatch table: maps each tool name (as exposed to the LLM) to its
# Python implementation, so the agent loop can look handlers up by name.
available_tools = {
    "get_weather": get_weather,
    "search_web": search_web
}

완전한 Tool Call 루프

def run_agent(user_message: str, max_iterations: int = 10) -> str:
    """Drive the OpenAI tool-call loop until the model answers in text.

    Each iteration sends the full history; tool results are appended as
    `role: "tool"` messages so the model can incorporate them next turn.
    """
    history = [{"role": "user", "content": user_message}]

    for _ in range(max_iterations):
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=history,
            tools=tools,
            tool_choice="auto"  # the model decides whether to call a tool
        )

        reply = completion.choices[0].message
        history.append(reply)

        # No tool requests means the model produced its final answer.
        if not reply.tool_calls:
            return reply.content

        for call in reply.tool_calls:
            fn_name = call.function.name
            fn_args = json.loads(call.function.arguments)

            print(f"Calling tool: {fn_name}({fn_args})")

            handler = available_tools.get(fn_name)
            if handler is None:
                outcome = f"Unknown tool: {fn_name}"
            else:
                try:
                    outcome = json.dumps(handler(**fn_args), ensure_ascii=False)
                except Exception as e:
                    outcome = f"Error: {str(e)}. Please try a different approach."

            history.append({
                "role": "tool",
                "tool_call_id": call.id,
                "content": outcome
            })

    return "Max iterations reached"

# 사용 예시
result = run_agent("서울 날씨 어때? 그리고 오늘 AI 뉴스도 찾아줘")
print(result)

Anthropic Claude의 Tool Use

Claude는 OpenAI와 다른 형식이지만 개념은 동일합니다:

import anthropic
import json

client = anthropic.Anthropic(api_key="your-api-key")

tools = [
    {
        "name": "get_weather",
        "description": "현재 날씨를 조회합니다",
        "input_schema": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "도시명"}
            },
            "required": ["city"]
        }
    }
]

def run_claude_agent(user_message: str) -> str:
    """Run the Claude tool-use loop until the model finishes its turn.

    Tool results are fed back as a user message containing tool_result
    blocks, per the Anthropic Messages API. Returns the concatenated
    text of the model's final response.
    """
    messages = [{"role": "user", "content": user_message}]

    while True:
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            tools=tools,
            messages=messages
        )

        # Keep the assistant turn in history so tool results can follow it.
        messages.append({"role": "assistant", "content": response.content})

        # Bug fix: only keep looping when the model actually requested a
        # tool. The original checked `== "end_turn"`, so any other
        # non-tool stop reason (e.g. "max_tokens") fell through and
        # appended an EMPTY tool_result message -> invalid request /
        # infinite loop.
        if response.stop_reason != "tool_use":
            # Extract only the text blocks for the final answer.
            return " ".join(
                block.text for block in response.content
                if hasattr(block, "text")
            )

        # Execute every tool_use block and collect the results.
        tool_results = []
        for block in response.content:
            if block.type == "tool_use":
                try:
                    result = available_tools[block.name](**block.input)
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": json.dumps(result)
                    })
                except Exception as e:
                    # Report the failure so the model can adapt.
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": f"Error: {str(e)}",
                        "is_error": True
                    })

        messages.append({"role": "user", "content": tool_results})

흔한 실수와 해결법 (실전 경험)

이 섹션이 이 글에서 가장 중요합니다. 실제로 프로덕션에서 마주치는 문제들입니다.

실수 1: Tool Description이 나쁘면 LLM이 잘못 부름

# 나쁜 예: 너무 모호함
{
    "name": "query",
    "description": "Query data"
}

# 좋은 예: 언제 쓰는지, 무엇을 반환하는지, 무엇에는 안 쓰는지 명시
{
    "name": "query_customer_orders",
    "description": (
        "특정 고객의 주문 내역을 조회합니다. "
        "customer_id가 필요합니다. "
        "반환: 주문 ID, 날짜, 금액, 상태 목록. "
        "고객 정보 조회에는 get_customer_info를 사용하세요."
    )
}

실수 2: 에러 핸들링 없음

# 나쁜 예: 에러가 터지면 전체 에이전트가 죽음
result = database.query(sql)

# 좋은 예: 에러를 LLM에게 반환 → LLM이 다른 접근법 시도
try:
    result = database.query(sql)
    return json.dumps(result)
except DatabaseError as e:
    return f"Database error: {str(e)}. The query may have a syntax error."
except TimeoutError:
    return "Query timed out. Try a simpler query or add more specific filters."
except Exception as e:
    return f"Unexpected error: {str(e)}. Please try a different approach."

실수 3: 무한 루프

# 반드시 루프 감지 추가
MAX_ITERATIONS = 15
call_count = {}

for iteration in range(MAX_ITERATIONS):
    # ...
    for tool_call in tool_calls:
        name = tool_call.function.name
        args_str = tool_call.function.arguments

        call_key = f"{name}:{args_str}"
        call_count[call_key] = call_count.get(call_key, 0) + 1

        if call_count[call_key] > 3:
            return "Error: Tool called too many times with same args. Breaking loop."

실수 4: 너무 많은 툴 정의

LLM의 성능은 툴이 많아질수록 저하됩니다. 10개 이상이면 관련 툴만 동적으로 선택하는 방식을 고려하세요.

def select_relevant_tools(user_query: str, all_tools: list, max_tools: int = 8) -> list:
    """Return at most *max_tools* tools whose descriptions best match the query.

    Relevance is the number of distinct words shared between the
    lowercased query and each tool's description; ties keep the
    original tool order (stable sort).
    """
    if len(all_tools) <= max_tools:
        return all_tools

    query_words = set(user_query.lower().split())

    def relevance(tool: dict) -> int:
        # Count distinct query words appearing in the description.
        description = tool["function"]["description"].lower()
        return len(query_words.intersection(description.split()))

    ranked = sorted(all_tools, key=relevance, reverse=True)
    return ranked[:max_tools]

실수 5: 민감한 작업에 확인 없음

삭제, 결제, 이메일 발송 같은 비가역적 작업에는 반드시 Human-in-the-loop를 추가하세요.

# 위험한 작업 목록
HIGH_RISK_TOOLS = {"delete_record", "send_email", "process_payment", "deploy_code"}

def execute_tool_with_approval(tool_name: str, args: dict) -> str:
    """Execute a tool, pausing for human approval on high-risk actions."""
    risky = tool_name in HIGH_RISK_TOOLS
    if risky:
        # Stand-in for a real approval channel (UI modal, Slack ping, ...).
        print(f"HIGH RISK ACTION: {tool_name}({args})")
        answer = input("Approve this action? (yes/no): ")
        if answer.lower() != "yes":
            return "Action cancelled by user"

    return available_tools[tool_name](**args)

Parallel Tool Calls (병렬 실행)

최신 LLM은 여러 툴을 동시에 호출할 수 있습니다. 이를 활용하면 속도가 크게 향상됩니다.

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def execute_tool_calls_parallel(tool_calls: list) -> list:
    """Execute multiple tool calls concurrently.

    Each result is a ready-to-append chat message dict. Blocking tool
    functions run via asyncio.to_thread so the calls genuinely overlap.
    """

    async def execute_single(tool_call):
        name = tool_call.function.name
        args = json.loads(tool_call.function.arguments)

        try:
            # asyncio.to_thread replaces the deprecated
            # get_event_loop() + per-call ThreadPoolExecutor pattern and
            # reuses the event loop's shared default executor.
            result = await asyncio.to_thread(
                lambda: available_tools[name](**args)
            )
            return {
                # "role" is required for the dict to be a valid chat
                # message (matches the English version of this snippet).
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result)
            }
        except Exception as e:
            return {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": f"Error: {str(e)}"
            }

    # Fan out all tool calls at once and await them together.
    tasks = [execute_single(tc) for tc in tool_calls]
    results = await asyncio.gather(*tasks)
    return results

# 사용 예시
# "서울, 도쿄, 뉴욕 날씨를 동시에 알려줘"
# LLM이 3개의 get_weather 호출을 동시에 반환
# → 순차 실행 시 3× 시간 vs 병렬 실행 시 1× 시간

Tool 설계 원칙

잘 설계된 툴과 그렇지 않은 툴의 차이는 에이전트 성능에 직결됩니다.

원칙 1: Single Responsibility

하나의 툴은 하나의 일만 합니다.

# 나쁜 예: 너무 많은 역할
def manage_customer(action: str, customer_id: int, **kwargs):
    if action == "get": ...
    elif action == "update": ...
    elif action == "delete": ...

# 좋은 예: 각각 분리
def get_customer(customer_id: int) -> dict: ...
def update_customer(customer_id: int, name: str = None, email: str = None) -> dict: ...
def delete_customer(customer_id: int) -> dict: ...

원칙 2: 읽기 vs 쓰기 분리

읽기 툴은 부작용이 없습니다. 쓰기 툴은 상태를 변경합니다. 명확히 분리하세요.

# 읽기 툴: 안전, 여러 번 호출해도 무방
def get_user_balance(user_id: int) -> float: ...
def search_products(query: str) -> list: ...

# 쓰기 툴: 비가역적, 조심해야 함
def transfer_money(from_id: int, to_id: int, amount: float) -> dict: ...
def delete_order(order_id: int) -> dict: ...

원칙 3: 구조화된 데이터 반환

자연어 문장이 아닌 JSON을 반환하세요. LLM이 파싱하고 추론하기 훨씬 쉽습니다.

# 나쁜 예: LLM이 파싱해야 하는 자연어
def get_user_info(user_id: int) -> str:
    return f"John Doe, 30 years old, email: john@example.com, joined 2022"

# 좋은 예: 구조화된 JSON
def get_user_info(user_id: int) -> dict:
    return {
        "id": user_id,
        "name": "John Doe",
        "age": 30,
        "email": "john@example.com",
        "joined_at": "2022-01-15"
    }

원칙 4: 에러 정보를 결과에 포함

def safe_tool_wrapper(func):
    """Wrap a tool function so every outcome is a structured result dict.

    Success -> {"success": True, "data": ...}; failures are mapped to
    machine-readable error codes the LLM can reason about instead of
    the exception killing the agent loop.
    """
    import functools  # local import keeps this snippet self-contained

    # functools.wraps preserves __name__/__doc__ of the wrapped tool —
    # without it, logging and name-keyed tool registries see "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
            return {"success": True, "data": result}
        except ValueError as e:
            return {"success": False, "error": "invalid_input", "message": str(e)}
        except PermissionError as e:
            return {"success": False, "error": "permission_denied", "message": str(e)}
        except Exception as e:
            return {"success": False, "error": "unexpected", "message": str(e)}
    return wrapper

@safe_tool_wrapper
def create_order(product_id: int, quantity: int, user_id: int) -> dict:
    # 주문 생성 로직
    ...

프로덕션 체크리스트

Tool Calling을 프로덕션에 배포할 때 확인해야 할 항목들:

class ToolCallMiddleware:
    """Production middleware around tool execution.

    Adds the safeguards a deployed agent needs: unknown-tool checks,
    argument validation, per-call timeouts, and a structured call log.
    """

    def __init__(self, tools: dict, max_iterations: int = 10):
        self.tools = tools
        self.max_iterations = max_iterations
        self.call_log = []  # one entry per execute() call, success or failure

    def _validate_args(self, tool_name: str, args: dict) -> dict:
        """Validate/normalize *args* for *tool_name*.

        Default pass-through hook; subclasses override to enforce a
        schema and raise ValueError on bad input. (The original snippet
        called this method without ever defining it, so every execute()
        died with AttributeError.)
        """
        return args

    async def execute(self, tool_name: str, args: dict, timeout: int = 30) -> dict:
        start_time = time.time()

        # 1. Tool existence check
        if tool_name not in self.tools:
            return {"error": f"Unknown tool: {tool_name}"}

        # 2. Input validation (important!)
        try:
            validated_args = self._validate_args(tool_name, args)
        except ValueError as e:
            return {"error": f"Invalid arguments: {str(e)}"}

        # 3. Execute with a timeout so a hung tool cannot stall the agent
        try:
            result = await asyncio.wait_for(
                asyncio.to_thread(self.tools[tool_name], **validated_args),
                timeout=timeout
            )
        except asyncio.TimeoutError:
            result = {"error": f"Timed out after {timeout}s"}
        except Exception as e:
            result = {"error": str(e)}

        # 4. Log the call. Tools may return any type, so only treat the
        # result as an error envelope when it is actually a dict —
        # `"error" not in result` on an int/float raised TypeError.
        elapsed = time.time() - start_time
        self.call_log.append({
            "tool": tool_name,
            "args": args,
            "elapsed": elapsed,
            "success": not (isinstance(result, dict) and "error" in result),
            "timestamp": time.time()
        })

        return result

마치며

Tool Calling은 LLM을 에이전트로 만드는 핵심 기술입니다. 기본 구현은 어렵지 않지만, 잘 동작하는 Tool Calling을 만드는 것은 디테일의 싸움입니다.

가장 중요한 것 세 가지: 명확한 툴 설명, 에러 처리, 루프 방지. 이 세 가지만 잘 해도 대부분의 문제를 피할 수 있습니다.

이 시리즈를 통해 에이전트 설계 패턴(Post 5), MCP(Post 6), Multi-Agent 시스템(Post 7), 그리고 Tool Calling(Post 8)을 다뤘습니다. 이제 실제로 무언가를 만들어 보세요 — 이론보다 구현에서 배우는 게 훨씬 빠릅니다.

Tool Calling in Practice: How AI Interacts with the World and Common Pitfalls

An LLM on its own transforms text. Add tools and it can actually do things. Tool calling is the mechanism that turns a passive text generator into an active agent.

This post covers how tool calling actually works under the hood, how to implement it correctly, and the specific mistakes that trip up every team building agent systems in production.

Why Tool Calling Is the Core of Agents

An agent's capabilities are exactly as wide as its available tools:

  • Search tool → access to real-time information
  • Calculator tool → guaranteed mathematical accuracy
  • Code execution tool → write and run actual code
  • API tool → integrate with external services
  • Database tool → read and write persistent data

Without tools, an LLM can only answer from its training data. With tools, it can look up today's stock prices, send emails, and execute code.

OpenAI Function Calling: Full Implementation

OpenAI's function calling has become the de facto standard API format.

Define Your Tools

import openai
import json
from typing import Any

client = openai.OpenAI(api_key="your-api-key")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": (
                "Get current weather for a specific city. "
                "Use this when the user asks about current weather conditions. "
                "Do NOT use for weather forecasts or historical data."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "City name in English, e.g. 'Seoul', 'Tokyo', 'New York'"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit. Default: celsius"
                    }
                },
                "required": ["city"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": (
                "Search the web for current information. "
                "Use for recent news, events, or anything that may have changed "
                "since the model's training cutoff."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query"
                    },
                    "num_results": {
                        "type": "integer",
                        "description": "Number of results to return (1-10)",
                        "default": 3
                    }
                },
                "required": ["query"]
            }
        }
    }
]

Implement the Actual Functions

def get_weather(city: str, unit: str = "celsius") -> dict:
    """Call the weather backend and return a structured report dict."""
    data = weather_api.get(city=city, unit=unit)
    return dict(
        city=city,
        temperature=data.temp,
        unit=unit,
        condition=data.condition,
        humidity=data.humidity,
    )

def search_web(query: str, num_results: int = 3) -> list:
    """Return search hits as a list of {title, snippet, url} dicts."""
    hits = search_api.search(query, count=num_results)
    formatted = []
    for hit in hits:
        formatted.append(
            {"title": hit.title, "snippet": hit.snippet, "url": hit.url}
        )
    return formatted

# Dispatch table mapping each tool name the LLM sees to its implementation.
available_tools = {
    "get_weather": get_weather,
    "search_web": search_web
}

The Complete Tool Call Loop

def run_agent(user_message: str, max_iterations: int = 10) -> str:
    """Run the OpenAI tool-call loop until the model returns plain text.

    Tool results are appended to the conversation as `role: "tool"`
    messages so the model sees them on its next turn.
    """
    conversation = [{"role": "user", "content": user_message}]

    for _ in range(max_iterations):
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=conversation,
            tools=tools,
            tool_choice="auto"
        )

        reply = completion.choices[0].message
        conversation.append(reply)

        # The model answered directly — we are done.
        if not reply.tool_calls:
            return reply.content

        # Otherwise run every requested tool and record each result.
        for call in reply.tool_calls:
            name = call.function.name
            arguments = json.loads(call.function.arguments)

            print(f"Calling: {name}({arguments})")

            if name not in available_tools:
                payload = f"Unknown tool: {name}"
            else:
                try:
                    payload = json.dumps(
                        available_tools[name](**arguments),
                        ensure_ascii=False
                    )
                except Exception as e:
                    payload = f"Error: {str(e)}. Try a different approach."

            conversation.append({
                "role": "tool",
                "tool_call_id": call.id,
                "content": payload
            })

    return "Max iterations reached"

result = run_agent("What's the weather in Seoul? Also find me today's AI news.")
print(result)

Anthropic Claude Tool Use

Claude uses a different format but the same concept:

import anthropic
import json

client = anthropic.Anthropic(api_key="your-api-key")

tools = [
    {
        "name": "get_weather",
        "description": "Get current weather for a city",
        "input_schema": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"}
            },
            "required": ["city"]
        }
    }
]

def run_claude_agent(user_message: str) -> str:
    """Run the Claude tool-use loop until the model finishes its turn.

    Tool results go back as a user message of tool_result blocks, per
    the Anthropic Messages API. Returns the final response text.
    """
    messages = [{"role": "user", "content": user_message}]

    while True:
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            tools=tools,
            messages=messages
        )

        messages.append({"role": "assistant", "content": response.content})

        # Bug fix: only keep looping when the model actually requested a
        # tool. The original checked `== "end_turn"`, so any other
        # non-tool stop reason (e.g. "max_tokens") fell through and
        # appended an EMPTY tool_result message -> invalid request /
        # infinite loop.
        if response.stop_reason != "tool_use":
            return " ".join(
                block.text for block in response.content
                if hasattr(block, "text")
            )

        # Handle tool_use blocks
        tool_results = []
        for block in response.content:
            if block.type == "tool_use":
                try:
                    result = available_tools[block.name](**block.input)
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": json.dumps(result)
                    })
                except Exception as e:
                    # Report the failure so the model can adapt.
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": f"Error: {str(e)}",
                        "is_error": True
                    })

        messages.append({"role": "user", "content": tool_results})

Five Common Mistakes and How to Fix Them

This section is the most valuable part of this post. These are real problems you will hit in production.

Mistake 1: Vague Tool Descriptions

The LLM uses your description to decide when and how to call your tool. Vague descriptions cause incorrect or unnecessary tool calls.

# Bad: ambiguous and unhelpful
{
    "name": "query",
    "description": "Query data"
}

# Good: when to use it, what it returns, what NOT to use it for
{
    "name": "query_customer_orders",
    "description": (
        "Fetch order history for a specific customer. Requires customer_id. "
        "Returns: list of orders with order ID, date, amount, and status. "
        "For customer profile info (name, email), use get_customer_info instead."
    )
}

Mistake 2: No Error Handling

Unhandled exceptions kill the entire agent loop.

# Bad: unhandled exception crashes the agent
result = database.query(sql)

# Good: errors become tool results — the LLM can adapt its approach
try:
    result = database.query(sql)
    return json.dumps(result)
except DatabaseError as e:
    return f"Database error: {str(e)}. The query may have a syntax error."
except TimeoutError:
    return "Query timed out. Try adding more specific filters or simplify the query."
except Exception as e:
    return f"Unexpected error: {str(e)}. Please try a different approach."

Mistake 3: No Loop Detection

An agent can get stuck calling the same tool with the same arguments repeatedly.

MAX_ITERATIONS = 15
call_count = {}

for iteration in range(MAX_ITERATIONS):
    # ...
    for tool_call in tool_calls:
        name = tool_call.function.name
        args_str = tool_call.function.arguments

        call_key = f"{name}:{args_str}"
        call_count[call_key] = call_count.get(call_key, 0) + 1

        if call_count[call_key] > 3:
            return "Error: same tool call repeated too many times. Breaking loop."

Mistake 4: Too Many Tools Defined at Once

LLM accuracy degrades when you provide too many tools. Beyond about 10, consider dynamic tool selection.

def select_relevant_tools(user_query: str, all_tools: list, max_tools: int = 8) -> list:
    """Pick at most *max_tools* tools whose descriptions best match the query.

    Relevance = number of distinct words shared between the lowercased
    query and each description; ties preserve original order.
    """
    if len(all_tools) <= max_tools:
        return all_tools

    query_words = set(user_query.lower().split())

    def relevance(tool: dict) -> int:
        description = tool["function"]["description"].lower()
        return len(query_words.intersection(description.split()))

    # sorted() is stable, so equally-scored tools keep their order.
    ranked = sorted(all_tools, key=relevance, reverse=True)
    return ranked[:max_tools]

Mistake 5: No Human Confirmation for Irreversible Actions

Delete, payment, email send — these need human approval before execution.

HIGH_RISK_TOOLS = {"delete_record", "send_email", "process_payment", "deploy_code"}

def execute_with_approval(tool_name: str, args: dict) -> str:
    """Run a tool, requiring human sign-off for irreversible actions."""
    if tool_name in HIGH_RISK_TOOLS:
        # Stand-in for a real approval channel (UI modal, Slack, ...).
        print(f"HIGH RISK: {tool_name}({args})")
        answer = input("Approve? (yes/no): ")
        approved = answer.lower() == "yes"
        if not approved:
            return "Action cancelled by user."

    return available_tools[tool_name](**args)

Parallel Tool Calls

Modern LLMs can request multiple tool calls in a single response. Use this — it makes agents dramatically faster.

import asyncio
from concurrent.futures import ThreadPoolExecutor
import time

async def execute_tool_calls_parallel(tool_calls: list) -> list:
    """Execute multiple tool calls concurrently.

    Each result is a ready-to-append `role: "tool"` chat message dict.
    Blocking tool functions run via asyncio.to_thread so the calls
    genuinely overlap.
    """

    async def execute_single(tool_call):
        name = tool_call.function.name
        args = json.loads(tool_call.function.arguments)

        try:
            # asyncio.to_thread replaces the deprecated
            # get_event_loop() + per-call ThreadPoolExecutor pattern and
            # reuses the event loop's shared default executor.
            result = await asyncio.to_thread(
                lambda: available_tools[name](**args)
            )
            return {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result)
            }
        except Exception as e:
            return {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": f"Error: {str(e)}"
            }

    # Fan out all tool calls at once and await them together.
    tasks = [execute_single(tc) for tc in tool_calls]
    results = await asyncio.gather(*tasks)
    return results

# Example: "What's the weather in Seoul, Tokyo, and New York?"
# LLM returns three get_weather calls simultaneously
# Sequential: 3 API calls × 1s each = 3 seconds
# Parallel: 3 API calls simultaneously = ~1 second

Tool Design Principles

The difference between well-designed and poorly-designed tools shows up directly in agent performance.

Principle 1: Single Responsibility

Each tool does one thing.

# Bad: one function doing everything
def manage_customer(action: str, customer_id: int, **kwargs): ...

# Good: separate functions
def get_customer(customer_id: int) -> dict: ...
def update_customer(customer_id: int, name: str = None, email: str = None) -> dict: ...
def delete_customer(customer_id: int) -> dict: ...

Principle 2: Separate Reads from Writes

Read tools have no side effects. Write tools change state. Keep them clearly separate.

# Read tools: safe, idempotent, can be called multiple times
def get_user_balance(user_id: int) -> float: ...
def search_products(query: str) -> list: ...

# Write tools: irreversible, handle with care
def transfer_money(from_id: int, to_id: int, amount: float) -> dict: ...
def delete_order(order_id: int) -> dict: ...

Principle 3: Return Structured Data

Return JSON, not prose. The LLM reasons over structured data far better than natural language sentences.

# Bad: LLM has to parse this
def get_user_info(user_id: int) -> str:
    return f"John Doe, 30 years old, email: john@example.com, joined 2022"

# Good: structured data the LLM can reason about precisely
def get_user_info(user_id: int) -> dict:
    return {
        "id": user_id,
        "name": "John Doe",
        "age": 30,
        "email": "john@example.com",
        "joined_at": "2022-01-15"
    }

Principle 4: Include Error Details in Results

def safe_tool_wrapper(func):
    """Wrap tool functions with consistent error handling.

    Success -> {"success": True, "data": ...}; failures become
    machine-readable error envelopes the LLM can reason about instead
    of the exception killing the agent loop.
    """
    import functools  # local import keeps this snippet self-contained

    # functools.wraps preserves __name__/__doc__ of the wrapped tool —
    # without it, logging and name-keyed tool registries see "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
            return {"success": True, "data": result}
        except ValueError as e:
            return {"success": False, "error": "invalid_input", "message": str(e)}
        except PermissionError as e:
            return {"success": False, "error": "permission_denied", "message": str(e)}
        except Exception as e:
            return {"success": False, "error": "unexpected", "message": str(e)}
    return wrapper

@safe_tool_wrapper
def create_order(product_id: int, quantity: int, user_id: int) -> dict:
    ...

Production Middleware

Putting it all together — a middleware class that wraps tool execution with the safeguards you need in production:

import time
import asyncio

class ToolCallMiddleware:
    """Production middleware around tool execution.

    Adds the safeguards a deployed agent needs: unknown-tool checks,
    argument validation, per-call timeouts, and a structured call log.
    """

    def __init__(self, tools: dict, max_iterations: int = 10):
        self.tools = tools
        self.max_iterations = max_iterations
        self.call_log = []  # one entry per execute() call, success or failure

    def _validate_args(self, tool_name: str, args: dict) -> dict:
        """Validate/normalize *args* for *tool_name*.

        Default pass-through hook; subclasses override to enforce a
        schema and raise ValueError on bad input. (The original snippet
        called this method without ever defining it, so every execute()
        died with AttributeError.)
        """
        return args

    async def execute(self, tool_name: str, args: dict, timeout: int = 30) -> dict:
        start_time = time.time()

        # 1. Tool existence check
        if tool_name not in self.tools:
            return {"error": f"Unknown tool: {tool_name}"}

        # 2. Input validation
        try:
            validated_args = self._validate_args(tool_name, args)
        except ValueError as e:
            return {"error": f"Invalid arguments: {str(e)}"}

        # 3. Execute with timeout so a hung tool cannot stall the agent
        try:
            result = await asyncio.wait_for(
                asyncio.to_thread(self.tools[tool_name], **validated_args),
                timeout=timeout
            )
        except asyncio.TimeoutError:
            result = {"error": f"Timed out after {timeout}s"}
        except Exception as e:
            result = {"error": str(e)}

        # 4. Log every call. Tools may return any type, so only treat
        # the result as an error envelope when it is actually a dict —
        # `"error" not in result` on an int/float raised TypeError.
        elapsed = time.time() - start_time
        self.call_log.append({
            "tool": tool_name,
            "args": args,
            "elapsed_ms": round(elapsed * 1000),
            "success": not (isinstance(result, dict) and "error" in result),
            "timestamp": time.time()
        })

        return result

Wrapping Up

Tool calling is what makes LLMs genuinely useful as agents. The basic implementation isn't complicated — but building tool calling that holds up in production is a game of details.

Three things matter most: clear tool descriptions, graceful error handling, and loop prevention. Get these right and you'll avoid the majority of the problems that sink tool-enabled agents.

This series covered agent design patterns (Post 5), MCP (Post 6), multi-agent systems (Post 7), and tool calling (Post 8). Now go build something — implementation teaches faster than theory.