Chaos and Order

💡 왼쪽 원문을 읽으면서 오른쪽에 따라 써보세요. Tab 키로 힌트를 받을 수 있습니다.

들어가며
BentoML vs 직접 구현
설치와 기본 사용
- 모델 저장
- Service 정의
LLM 서빙 — OpenLLM 연동
멀티모델 파이프라인
Bento 빌드와 Docker
- bentofile.yaml
Kubernetes 배포
Adaptive Batching
모니터링
정리

들어가며

ML 모델을 학습시키는 것과 프로덕션에서 서빙하는 것은 완전히 다른 문제입니다. BentoML은 이 간극을 메우는 프레임워크로, 모델을 API 서비스로 패키징하고 어디서든 배포할 수 있게 합니다. Flask/FastAPI로 직접 API를 구축하는 것보다 훨씬 체계적인 접근법을 제공합니다.

BentoML vs 직접 구현

항목	Flask/FastAPI 직접 구현	BentoML
API 구현	수동 (라우팅, 직렬화)	데코레이터 기반 자동화
모델 버전 관리	직접 구현 필요	내장 Model Store
배치 처리	직접 구현	Adaptive Batching 내장
Docker 빌드	Dockerfile 수동 작성	자동 생성
GPU 지원	수동 설정	선언적 설정

설치와 기본 사용

pip install bentoml

모델 저장

# save_model.py
import bentoml
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Train a 100-tree random forest on the full iris dataset.
iris_features, iris_labels = load_iris(return_X_y=True)
model = RandomForestClassifier(n_estimators=100).fit(iris_features, iris_labels)

# Save the fitted estimator to the BentoML Model Store.
# The options are collected first so the save call itself stays short.
store_options = {
    "signatures": {"predict": {"batchable": True}},
    "labels": {"owner": "ml-team", "stage": "production"},
    "metadata": {"accuracy": 0.96, "dataset": "iris"},
}
saved_model = bentoml.sklearn.save_model("iris_classifier", model, **store_options)

print(f"Model saved: {saved_model}")
# Example output:
# Model saved: Model(tag="iris_classifier:abc123")

# Check the saved models
bentoml models list
# Example output:
# Tag                          Module    Size    Creation Time
# iris_classifier:abc123       sklearn   1.2MB   2026-03-03 05:00:00

Service 정의

# service.py
import bentoml
import numpy as np
from typing import Annotated

@bentoml.service(
    resources={"cpu": "2", "memory": "1Gi"},
    traffic={"timeout": 30, "concurrency": 32},
)
class IrisClassifier:
    """Serve the stored ``iris_classifier`` model through two prediction APIs.

    ``predict`` scores a single sample and ``predict_batch`` scores a 2-D
    array of samples. Both map the predicted value to a species name by
    using it as an index into ``target_names``.
    """

    # Reference to the newest "iris_classifier" entry in the Model Store.
    model = bentoml.models.get("iris_classifier:latest")

    def __init__(self):
        self.target_names = ["setosa", "versicolor", "virginica"]
        self.clf = bentoml.sklearn.load_model(self.model)

    @bentoml.api
    def predict(
        self,
        features: Annotated[np.ndarray, bentoml.validators.Shape((4,))],
    ) -> dict:
        """Classify one sample.

        Args:
            features: One sample, annotated with a ``Shape((4,))`` validator.

        Returns:
            A dict with the predicted species (``"class"``), the highest
            class probability (``"probability"``) and the probability of
            every species (``"all_probabilities"``).
        """
        # The estimator works on batches, so wrap the sample in a list.
        sample = [features]
        label_index = self.clf.predict(sample)[0]
        class_probs = self.clf.predict_proba(sample)[0]

        per_class = {}
        for species, prob in zip(self.target_names, class_probs):
            per_class[species] = float(prob)

        return {
            "class": self.target_names[label_index],
            "probability": float(max(class_probs)),
            "all_probabilities": per_class,
        }

    @bentoml.api
    def predict_batch(
        self,
        features: Annotated[np.ndarray, bentoml.validators.Shape((-1, 4))],
    ) -> list[dict]:
        """Classify several samples at once.

        Args:
            features: Samples, annotated with a ``Shape((-1, 4))`` validator.

        Returns:
            One dict per input row holding the predicted species
            (``"class"``) and its highest class probability
            (``"probability"``).
        """
        label_indices = self.clf.predict(features)
        prob_rows = self.clf.predict_proba(features)

        results = []
        for label_index, row in zip(label_indices, prob_rows):
            results.append(
                {
                    "class": self.target_names[label_index],
                    "probability": float(max(row)),
                }
            )
        return results

# Serve locally
bentoml serve service:IrisClassifier

# Test the /predict endpoint
curl -X POST http://localhost:3000/predict \
  -H "Content-Type: application/json" \
  -d '{"features": [5.1, 3.5, 1.4, 0.2]}'
# Example response:
# {"class": "setosa", "probability": 0.98, ...}

LLM 서빙 — OpenLLM 연동

아래 예제는 OpenLLM을 거치지 않고, vLLM 엔진을 BentoML 서비스 안에서 직접 불러와 사용하는 방식입니다.

# llm_service.py
import bentoml
from vllm import LLM, SamplingParams

@bentoml.service(
    resources={"gpu": 1, "gpu_type": "nvidia-a100"},
    traffic={"timeout": 120, "concurrency": 16},
)
class LLMService:
    """Text-generation service backed by an in-process vLLM ``LLM`` engine.

    Exposes ``generate`` for raw prompts and ``chat`` for role/content
    message lists.
    """

    def __init__(self):
        import threading

        self.llm = LLM(
            model="meta-llama/Llama-3.1-8B-Instruct",
            tensor_parallel_size=1,
            max_model_len=8192,
            gpu_memory_utilization=0.9,
        )
        # ``LLM.generate`` is synchronous. It now runs in worker threads, so
        # this lock keeps engine calls one-at-a-time, exactly as they were
        # when the call ran directly on the event loop.
        # NOTE(review): presumably the offline LLM engine is not safe for
        # concurrent calls -- confirm against the vLLM documentation.
        self._engine_lock = threading.Lock()

    def _generate_blocking(self, prompt, sampling_params):
        """Run one synchronous generation while holding the engine lock."""
        with self._engine_lock:
            return self.llm.generate([prompt], sampling_params)

    @bentoml.api
    async def generate(self, prompt: str, max_tokens: int = 512) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text passed to the model as-is.
            max_tokens: Upper bound on the number of generated tokens.

        Returns:
            The text of the first completion of the first request output.
        """
        import asyncio

        sampling_params = SamplingParams(
            temperature=0.7,
            top_p=0.9,
            max_tokens=max_tokens,
        )
        # Off-load the blocking call so the event loop stays free to serve
        # other requests while the model is generating.
        outputs = await asyncio.to_thread(
            self._generate_blocking, prompt, sampling_params
        )
        return outputs[0].outputs[0].text

    @bentoml.api
    async def chat(self, messages: list[dict]) -> str:
        """Answer a chat conversation.

        Args:
            messages: Dicts that each provide a ``"role"`` and a
                ``"content"`` key.

        Returns:
            The generated assistant reply (default ``max_tokens``).

        Raises:
            KeyError: If a message lacks ``"role"`` or ``"content"``.
        """
        prompt = self._format_chat(messages)
        return await self.generate(prompt)

    def _format_chat(self, messages):
        """Flatten ``messages`` into one prompt ending with an assistant tag."""
        # NOTE(review): this generic <|role|> layout is presumably not the
        # chat template defined by the Llama 3.1 Instruct tokenizer --
        # confirm, and consider the tokenizer's own chat template.
        segments = [f"<|{msg['role']}|>\n{msg['content']}\n" for msg in messages]
        segments.append("<|assistant|>\n")
        return "".join(segments)

멀티모델 파이프라인

# pipeline_service.py
import bentoml
import numpy as np
from PIL import Image

# The dependency services are defined first: ``bentoml.depends(SomeClass)``
# is evaluated while the pipeline's class body executes, so every referenced
# class must already exist at that point.


@bentoml.service(resources={"cpu": "1"})
class ImagePreprocessor:
    """Convert a PIL image into a scaled, channel-first array."""

    @bentoml.api
    async def process(self, image: Image.Image) -> np.ndarray:
        """Resize ``image`` to 224x224 and return it as a (3, 224, 224) array.

        Pixel values are divided by 255.0.
        """
        # Force three channels so the transpose below also works for
        # grayscale, palette and RGBA inputs.
        img = image.convert("RGB").resize((224, 224))
        arr = np.array(img) / 255.0
        return arr.transpose(2, 0, 1)


@bentoml.service(resources={"gpu": 1})
class ImageClassifier:
    """Run the stored ``resnet50`` PyTorch model on one preprocessed image."""

    # Model Store reference; replaced on the instance by the loaded module.
    model = bentoml.models.get("resnet50:latest")

    def __init__(self):
        import torch

        self.model = bentoml.pytorch.load_model(self.model)
        self.model.eval()
        # Fall back to CPU so the service can also start on a machine
        # without CUDA, e.g. during local development.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    @bentoml.api
    async def predict(self, features: np.ndarray) -> np.ndarray:
        """Return the model output for a single image.

        Args:
            features: One image array; a leading batch axis is added here.

        Returns:
            The model output moved back to the CPU as a NumPy array.
        """
        import torch

        tensor = torch.tensor(features).unsqueeze(0).float().to(self.device)
        with torch.no_grad():
            output = self.model(tensor)
        return output.cpu().numpy()


@bentoml.service(resources={"cpu": "1"})
class ResultPostprocessor:
    """Minimal post-processing step: raw scores -> top class and probability.

    NOTE(review): the original snippet referenced this service without
    defining it. This is a placeholder; replace it with the real
    label mapping / formatting logic.
    """

    @bentoml.api
    async def format(self, raw_result: np.ndarray) -> dict:
        """Apply a softmax to ``raw_result`` and report the best class.

        Assumes ``raw_result`` holds the scores of a single image -- TODO confirm.
        """
        scores = np.asarray(raw_result, dtype=np.float64).reshape(-1)
        # Subtract the maximum before exponentiating for numerical stability.
        exp_scores = np.exp(scores - scores.max())
        probabilities = exp_scores / exp_scores.sum()
        top_index = int(probabilities.argmax())
        return {
            "class_id": top_index,
            "probability": float(probabilities[top_index]),
        }


@bentoml.service(resources={"cpu": "4", "memory": "4Gi"})
class ImageClassificationPipeline:
    """Compose preprocessing, classification and post-processing services."""

    # Combine several services
    preprocessor = bentoml.depends(ImagePreprocessor)
    classifier = bentoml.depends(ImageClassifier)
    postprocessor = bentoml.depends(ResultPostprocessor)

    @bentoml.api
    async def classify(self, image: Image.Image) -> dict:
        """Classify ``image`` by running it through the three services in order."""
        # 1. Preprocess
        features = await self.preprocessor.process(image)

        # 2. Classify
        raw_result = await self.classifier.predict(features)

        # 3. Post-process
        result = await self.postprocessor.format(raw_result)

        return result

Bento 빌드와 Docker

bentofile.yaml

# bentofile.yaml
# Build definition read by `bentoml build`.

# Service entry point: the IrisClassifier class in service.py
service: 'service:IrisClassifier'
# Free-form labels attached to the Bento
labels:
  owner: ml-team
  project: iris-classifier
# Files to include in the Bento
include:
  - '*.py'
# Python package dependencies
python:
  packages:
    - scikit-learn==1.5.0
    - numpy
# Settings for the generated Docker image
docker:
  python_version: '3.11'
  system_packages:
    - libgomp1
  env:
    BENTOML_PORT: '3000'

# Build the Bento
bentoml build

# Check the built Bentos
bentoml list
# Example output:
# Tag                              Size     Creation Time
# iris_classifier_service:xyz789   45MB     2026-03-03

# Create the Docker image
bentoml containerize iris_classifier_service:latest

# Run the Docker container, publishing port 3000
docker run -p 3000:3000 iris_classifier_service:latest

Kubernetes 배포

# k8s-deployment.yaml
# Three manifests: a Deployment, a ClusterIP Service and a
# HorizontalPodAutoscaler, all named iris-classifier in namespace ml-serving.

# --- Deployment: runs the Bento container image ---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: iris-classifier
  namespace: ml-serving
spec:
  replicas: 3
  selector:
    matchLabels:
      app: iris-classifier
  template:
    metadata:
      labels:
        app: iris-classifier
    spec:
      containers:
        - name: bento
          image: registry.example.com/iris_classifier_service:latest
          ports:
            - containerPort: 3000
          resources:
            requests:
              cpu: '1'
              memory: '1Gi'
            limits:
              cpu: '2'
              memory: '2Gi'
          # Both probes call /healthz on the serving port; the liveness
          # probe starts later (30s) than the readiness probe (10s).
          readinessProbe:
            httpGet:
              path: /healthz
              port: 3000
            initialDelaySeconds: 10
          livenessProbe:
            httpGet:
              path: /healthz
              port: 3000
            initialDelaySeconds: 30
---
# --- Service: exposes the pods on port 80 inside the cluster ---
apiVersion: v1
kind: Service
metadata:
  name: iris-classifier
  namespace: ml-serving
spec:
  selector:
    app: iris-classifier
  ports:
    - port: 80
      targetPort: 3000
  type: ClusterIP
---
# --- HorizontalPodAutoscaler: 2-10 replicas, targeting 70% average CPU ---
# NOTE(review): the Deployment above also sets replicas: 3 while
# minReplicas here is 2 -- confirm which replica count is intended.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: iris-classifier
  namespace: ml-serving
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: iris-classifier
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

Adaptive Batching

BentoML의 핵심 기능으로, 여러 요청을 자동으로 묶어 GPU 활용률을 극대화합니다.

@bentoml.service(traffic={"timeout": 30})
class EmbeddingService:
    """Sentence-embedding service whose ``encode`` API is declared batchable."""

    # Model Store reference; ``__init__`` replaces it on the instance with
    # the loaded SentenceTransformer.
    model = bentoml.models.get("sentence-transformer:latest")

    def __init__(self):
        from sentence_transformers import SentenceTransformer

        model_dir = self.model.path
        self.model = SentenceTransformer(model_dir)

    @bentoml.api(batchable=True, batch_dim=0, max_batch_size=64, max_latency_ms=100)
    async def encode(self, texts: list[str]) -> np.ndarray:
        """Return the embeddings the model produces for ``texts``."""
        # Individual requests are automatically grouped into a batch
        # before this method runs.
        return self.model.encode(texts)

모니터링

# Add custom metrics
import bentoml
import numpy as np
from prometheus_client import Counter, Histogram

# Counts predictions per model and predicted class.
prediction_counter = Counter(
    "predictions_total", "Total predictions", ["model", "class"]
)
# Observes how long each prediction takes, in seconds.
latency_histogram = Histogram(
    "prediction_latency_seconds", "Prediction latency"
)


@bentoml.service
class MonitoredClassifier:
    """Iris classifier that records a counter and a latency histogram."""

    # Same Model Store entry that the IrisClassifier service loads.
    model = bentoml.models.get("iris_classifier:latest")

    def __init__(self):
        # ``predict`` reads ``self.clf``, so the model must be loaded here.
        self.clf = bentoml.sklearn.load_model(self.model)

    @bentoml.api
    def predict(self, features: np.ndarray) -> dict:
        """Classify one sample and update the Prometheus metrics.

        Args:
            features: A single sample; it is wrapped in a list before
                being passed to the estimator.

        Returns:
            ``{"class": <predicted value>}`` with the value converted to a
            native Python scalar.
        """
        with latency_histogram.time():
            # ``.item()`` turns the NumPy scalar into a plain Python value
            # so it can be serialized in the response.
            result = self.clf.predict([features])[0].item()
            # The label is named "class", a Python keyword, so it cannot be
            # written as a normal keyword argument; unpack a dict instead.
            prediction_counter.labels(
                **{"model": "iris_v1", "class": str(result)}
            ).inc()
            return {"class": result}

# Prometheus metrics endpoint
curl http://localhost:3000/metrics

정리

BentoML은 ML 모델 서빙의 복잡성을 크게 줄여줍니다:

간편한 API 구현: 데코레이터 기반으로 몇 줄만에 REST API 생성
모델 버전 관리: 내장 Model Store로 체계적 관리
Adaptive Batching: GPU 활용률 극대화
Docker 자동화: bentofile.yaml로 재현 가능한 빌드
Kubernetes 네이티브: HPA와 함께 자동 스케일링

✅ 퀴즈: BentoML 이해도 점검 (7문제)

Q1. BentoML의 Model Store란?

학습된 모델을 버전 태그 및 메타데이터와 함께 로컬에 저장하는 저장소입니다. bentoml.sklearn.save_model() 등으로 저장합니다.

Q2. Adaptive Batching의 동작 원리는?

개별 요청을 자동으로 모아 max_batch_size 또는 max_latency_ms에 도달하면 한 번에 처리하여 GPU 효율을 극대화합니다.

Q3. bentoml.depends()의 역할은?

멀티모델 파이프라인에서 다른 BentoML 서비스를 의존성으로 주입하여 서비스 간 통신을 자동 관리합니다.

Q4. bentofile.yaml에서 정의하는 것들은?

서비스 엔트리포인트, Python 패키지 의존성, Docker 설정, 포함할 파일 등을 선언합니다.

Q5. BentoML의 /healthz 엔드포인트 용도는?

Kubernetes의 readiness/liveness probe에 사용하여 서비스의 준비 상태와 생존 여부를 확인합니다.

Q6. GPU 리소스를 지정하는 방법은?

@bentoml.service(resources={"gpu": 1, "gpu_type": "nvidia-a100"}) 데코레이터로 선언합니다.

Q7. BentoML이 Flask/FastAPI 직접 구현보다 나은 점은?

모델 버전 관리, Adaptive Batching, Docker 자동 빌드, 선언적 리소스 관리 등이 내장되어 있어 프로덕션 준비가 빠릅니다.