MLflow 완벽 가이드: 실험 추적부터 Model Registry, 프로덕션 배포까지
MLflow 완벽 가이드: 실험 추적부터 Model Registry, 프로덕션 배포까지
MLflow란?
MLflow는 ML 라이프사이클을 관리하는 오픈소스 플랫폼입니다. 네 가지 핵심 컴포넌트로 구성됩니다:
- MLflow Tracking: 실험 파라미터, 메트릭, 아티팩트 기록
- MLflow Projects: 재현 가능한 ML 코드 패키징
- MLflow Models: 다양한 프레임워크의 모델을 통일된 형식으로 패키징
- MLflow Model Registry: 모델 버전 관리 및 배포 워크플로우
설치 및 서버 설정
기본 설치
# pip 설치
pip install mlflow
# 추가 프레임워크 지원
pip install mlflow[extras] # sklearn, tensorflow, pytorch 등
# 서버 시작 (로컬)
mlflow server --host 0.0.0.0 --port 5000
# PostgreSQL + S3 백엔드로 프로덕션 서버
mlflow server \
--backend-store-uri postgresql://mlflow:password@localhost:5432/mlflow \
--default-artifact-root s3://mlflow-artifacts/ \
--host 0.0.0.0 --port 5000
Docker Compose로 배포
# docker-compose.yml
services:
mlflow:
image: ghcr.io/mlflow/mlflow:v2.18.0
ports:
- '5000:5000'
environment:
- MLFLOW_BACKEND_STORE_URI=postgresql://mlflow:password@postgres:5432/mlflow
- MLFLOW_DEFAULT_ARTIFACT_ROOT=s3://mlflow-artifacts/
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
command: >
mlflow server
--backend-store-uri postgresql://mlflow:password@postgres:5432/mlflow
--default-artifact-root s3://mlflow-artifacts/
--host 0.0.0.0 --port 5000
depends_on:
- postgres
postgres:
image: postgres:16
environment:
POSTGRES_USER: mlflow
POSTGRES_PASSWORD: password
POSTGRES_DB: mlflow
volumes:
- pgdata:/var/lib/postgresql/data
volumes:
pgdata:
실험 추적 (Tracking)
기본 사용법
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score
# 트래킹 서버 설정
mlflow.set_tracking_uri("http://localhost:5000")
# 실험 생성/설정
mlflow.set_experiment("iris-classification")
# 데이터 준비
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 실험 실행
with mlflow.start_run(run_name="rf-baseline"):
# 파라미터 기록
params = {
"n_estimators": 100,
"max_depth": 5,
"min_samples_split": 2,
"random_state": 42
}
mlflow.log_params(params)
# 모델 학습
model = RandomForestClassifier(**params)
model.fit(X_train, y_train)
# 예측 및 메트릭
y_pred = model.predict(X_test)
metrics = {
"accuracy": accuracy_score(y_test, y_pred),
"f1_macro": f1_score(y_test, y_pred, average="macro"),
"precision_macro": precision_score(y_test, y_pred, average="macro")
}
mlflow.log_metrics(metrics)
# 태그
mlflow.set_tag("model_type", "random_forest")
mlflow.set_tag("dataset", "iris")
# 모델 저장
mlflow.sklearn.log_model(
model,
artifact_path="model",
registered_model_name="iris-classifier"
)
# 커스텀 아티팩트 (그래프, 보고서 등)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm).plot(ax=ax)
fig.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png")
print(f"Run ID: {mlflow.active_run().info.run_id}")
print(f"Metrics: {metrics}")
하이퍼파라미터 튜닝 추적
import optuna
import mlflow
def objective(trial):
params = {
"n_estimators": trial.suggest_int("n_estimators", 50, 500),
"max_depth": trial.suggest_int("max_depth", 2, 20),
"min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
"min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
}
with mlflow.start_run(nested=True, run_name=f"trial-{trial.number}"):
mlflow.log_params(params)
model = RandomForestClassifier(**params, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mlflow.log_metric("accuracy", accuracy)
return accuracy
# Optuna 스터디 실행
with mlflow.start_run(run_name="hyperparameter-tuning"):
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
# 최적 결과 기록
mlflow.log_params(study.best_params)
mlflow.log_metric("best_accuracy", study.best_value)
mlflow.set_tag("best_trial", study.best_trial.number)
PyTorch 모델 추적
import torch
import torch.nn as nn
import mlflow.pytorch
class SimpleNet(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim):
super().__init__()
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
return self.fc2(self.relu(self.fc1(x)))
with mlflow.start_run(run_name="pytorch-model"):
model = SimpleNet(4, 32, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
mlflow.log_params({
"hidden_dim": 32,
"learning_rate": 0.001,
"optimizer": "Adam",
"epochs": 100
})
for epoch in range(100):
# 학습 로직...
loss = criterion(model(X_tensor), y_tensor)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 에폭별 메트릭 기록
mlflow.log_metric("train_loss", loss.item(), step=epoch)
# PyTorch 모델 저장
mlflow.pytorch.log_model(model, "model")
Model Registry
모델 등록 및 버전 관리
from mlflow import MlflowClient
client = MlflowClient()
# 모델 등록 (log_model에서 registered_model_name 사용 시 자동 등록)
# 또는 수동 등록:
result = client.create_registered_model(
name="iris-classifier",
description="Iris 꽃 분류 모델"
)
# 특정 실행의 모델을 버전으로 등록
model_version = client.create_model_version(
name="iris-classifier",
source=f"runs:/{run_id}/model",
run_id=run_id,
description="RandomForest baseline v1"
)
print(f"Model Version: {model_version.version}")
Alias를 활용한 배포 관리
# MLflow 2.x에서는 Alias 사용 (Stage는 deprecated)
client = MlflowClient()
# 프로덕션 alias 설정
client.set_registered_model_alias(
name="iris-classifier",
alias="champion",
version=3
)
# 도전자 모델 설정
client.set_registered_model_alias(
name="iris-classifier",
alias="challenger",
version=5
)
# Alias로 모델 로드
champion_model = mlflow.pyfunc.load_model("models:/iris-classifier@champion")
challenger_model = mlflow.pyfunc.load_model("models:/iris-classifier@challenger")
# A/B 테스트
champion_pred = champion_model.predict(X_test)
challenger_pred = challenger_model.predict(X_test)
print(f"Champion accuracy: {accuracy_score(y_test, champion_pred)}")
print(f"Challenger accuracy: {accuracy_score(y_test, challenger_pred)}")
모델 태그 활용
# 모델 버전에 태그 추가
client.set_model_version_tag(
name="iris-classifier",
version=3,
key="validation_status",
value="approved"
)
client.set_model_version_tag(
name="iris-classifier",
version=3,
key="approved_by",
value="data-science-lead"
)
# 태그로 모델 검색
from mlflow import search_model_versions
approved_versions = search_model_versions(
"name='iris-classifier' AND tag.validation_status='approved'"
)
모델 서빙
MLflow 내장 서빙
# 로컬 REST API 서빙
mlflow models serve \
-m "models:/iris-classifier@champion" \
--port 8080 \
--no-conda
# 테스트 요청
curl -X POST http://localhost:8080/invocations \
-H "Content-Type: application/json" \
-d '{"inputs": [[5.1, 3.5, 1.4, 0.2]]}'
FastAPI 커스텀 서빙
from fastapi import FastAPI
import mlflow.pyfunc
import numpy as np
app = FastAPI()
# 모델 로드 (서버 시작 시 1회)
model = mlflow.pyfunc.load_model("models:/iris-classifier@champion")
@app.post("/predict")
async def predict(features: list[list[float]]):
predictions = model.predict(np.array(features))
return {
"predictions": predictions.tolist(),
"model_version": "champion"
}
@app.get("/health")
async def health():
return {"status": "healthy", "model": "iris-classifier@champion"}
실험 비교 및 분석
MLflow UI에서 비교
# 실험 검색 (CLI)
mlflow runs list --experiment-id 1
# 메트릭 기반 검색
mlflow runs list \
--experiment-id 1 \
--filter "metrics.accuracy > 0.95" \
--order-by "metrics.accuracy DESC"
Python API로 분석
import mlflow
import pandas as pd
# 실험의 모든 실행 조회
runs = mlflow.search_runs(
experiment_ids=["1"],
filter_string="metrics.accuracy > 0.9",
order_by=["metrics.accuracy DESC"],
max_results=10
)
# DataFrame으로 분석
print(runs[["run_id", "params.n_estimators", "params.max_depth", "metrics.accuracy"]])
# 최적 실행 찾기
best_run = runs.iloc[0]
print(f"Best run: {best_run.run_id}, Accuracy: {best_run['metrics.accuracy']}")
프로덕션 체크리스트
□ 백엔드 스토어를 PostgreSQL/MySQL로 설정
□ 아티팩트 스토어를 S3/GCS/MinIO로 설정
□ 인증/권한 설정 (OIDC, Basic Auth)
□ 자동 실험 기록 (autolog) 설정
□ Model Registry alias 규칙 정립
□ CI/CD에서 모델 검증 자동화
□ 모델 서빙 헬스체크 설정
□ 실험 정리 정책 (오래된 실행 아카이브)
📝 확인 퀴즈 (6문제)
Q1. MLflow의 네 가지 핵심 컴포넌트는?
Tracking, Projects, Models, Model Registry
Q2. mlflow.log_params와 mlflow.log_metrics의 차이점은?
log_params는 학습 하이퍼파라미터(문자열)를 기록하고, log_metrics는 성능 지표(숫자)를 기록합니다. 메트릭은 step 파라미터로 에폭별 추적이 가능합니다.
Q3. MLflow 2.x에서 모델 배포 관리에 사용하는 개념은?
Alias (예: @champion, @challenger). Stage는 deprecated되었습니다.
Q4. nested=True 파라미터는 언제 사용하나요?
하이퍼파라미터 튜닝처럼 부모 실행 안에서 여러 자식 실행을 기록할 때 사용합니다.
Q5. 아티팩트 스토어로 S3를 사용하는 이유는?
모델 파일, 그래프 등 대용량 아티팩트를 확장 가능한 객체 스토리지에 저장하여 팀 간 공유와 버전 관리가 용이합니다.
Q6. mlflow.autolog()의 장점과 단점은?
장점: 코드 수정 없이 자동으로 파라미터/메트릭/모델을 기록. 단점: 불필요한 정보가 많이 기록될 수 있고, 커스텀 메트릭은 별도 기록 필요.
The Complete MLflow Guide: From Experiment Tracking to Model Registry and Production Deployment
- What is MLflow?
- Installation and Server Setup
- Experiment Tracking
- Model Registry
- Model Serving
- Experiment Comparison and Analysis
- Production Checklist
- Quiz
What is MLflow?
MLflow is an open-source platform for managing the ML lifecycle. It consists of four core components:
- MLflow Tracking: Records experiment parameters, metrics, and artifacts
- MLflow Projects: Packages ML code for reproducibility
- MLflow Models: Packages models from various frameworks in a unified format
- MLflow Model Registry: Model version management and deployment workflows
Installation and Server Setup
Basic Installation
# pip installation
pip install mlflow
# Additional framework support
pip install mlflow[extras] # sklearn, tensorflow, pytorch, etc.
# Start server (local)
mlflow server --host 0.0.0.0 --port 5000
# Production server with PostgreSQL + S3 backend
mlflow server \
--backend-store-uri postgresql://mlflow:password@localhost:5432/mlflow \
--default-artifact-root s3://mlflow-artifacts/ \
--host 0.0.0.0 --port 5000
Deployment with Docker Compose
# docker-compose.yml
services:
mlflow:
image: ghcr.io/mlflow/mlflow:v2.18.0
ports:
- '5000:5000'
environment:
- MLFLOW_BACKEND_STORE_URI=postgresql://mlflow:password@postgres:5432/mlflow
- MLFLOW_DEFAULT_ARTIFACT_ROOT=s3://mlflow-artifacts/
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
command: >
mlflow server
--backend-store-uri postgresql://mlflow:password@postgres:5432/mlflow
--default-artifact-root s3://mlflow-artifacts/
--host 0.0.0.0 --port 5000
depends_on:
- postgres
postgres:
image: postgres:16
environment:
POSTGRES_USER: mlflow
POSTGRES_PASSWORD: password
POSTGRES_DB: mlflow
volumes:
- pgdata:/var/lib/postgresql/data
volumes:
pgdata:
Experiment Tracking
Basic Usage
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score
# Configure tracking server
mlflow.set_tracking_uri("http://localhost:5000")
# Create/set experiment
mlflow.set_experiment("iris-classification")
# Prepare data
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Run experiment
with mlflow.start_run(run_name="rf-baseline"):
# Log parameters
params = {
"n_estimators": 100,
"max_depth": 5,
"min_samples_split": 2,
"random_state": 42
}
mlflow.log_params(params)
# Train model
model = RandomForestClassifier(**params)
model.fit(X_train, y_train)
# Predictions and metrics
y_pred = model.predict(X_test)
metrics = {
"accuracy": accuracy_score(y_test, y_pred),
"f1_macro": f1_score(y_test, y_pred, average="macro"),
"precision_macro": precision_score(y_test, y_pred, average="macro")
}
mlflow.log_metrics(metrics)
# Tags
mlflow.set_tag("model_type", "random_forest")
mlflow.set_tag("dataset", "iris")
# Save model
mlflow.sklearn.log_model(
model,
artifact_path="model",
registered_model_name="iris-classifier"
)
# Custom artifacts (plots, reports, etc.)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm).plot(ax=ax)
fig.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png")
print(f"Run ID: {mlflow.active_run().info.run_id}")
print(f"Metrics: {metrics}")
Hyperparameter Tuning Tracking
import optuna
import mlflow
def objective(trial):
params = {
"n_estimators": trial.suggest_int("n_estimators", 50, 500),
"max_depth": trial.suggest_int("max_depth", 2, 20),
"min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
"min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
}
with mlflow.start_run(nested=True, run_name=f"trial-{trial.number}"):
mlflow.log_params(params)
model = RandomForestClassifier(**params, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mlflow.log_metric("accuracy", accuracy)
return accuracy
# Run Optuna study
with mlflow.start_run(run_name="hyperparameter-tuning"):
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
# Log best results
mlflow.log_params(study.best_params)
mlflow.log_metric("best_accuracy", study.best_value)
mlflow.set_tag("best_trial", study.best_trial.number)
PyTorch Model Tracking
import torch
import torch.nn as nn
import mlflow.pytorch
class SimpleNet(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim):
super().__init__()
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
return self.fc2(self.relu(self.fc1(x)))
with mlflow.start_run(run_name="pytorch-model"):
model = SimpleNet(4, 32, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
mlflow.log_params({
"hidden_dim": 32,
"learning_rate": 0.001,
"optimizer": "Adam",
"epochs": 100
})
for epoch in range(100):
# Training logic...
loss = criterion(model(X_tensor), y_tensor)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Log per-epoch metrics
mlflow.log_metric("train_loss", loss.item(), step=epoch)
# Save PyTorch model
mlflow.pytorch.log_model(model, "model")
Model Registry
Model Registration and Version Management
from mlflow import MlflowClient
client = MlflowClient()
# Register model (auto-registered when using registered_model_name in log_model)
# Or manually register:
result = client.create_registered_model(
name="iris-classifier",
description="Iris flower classification model"
)
# Register a specific run's model as a version
model_version = client.create_model_version(
name="iris-classifier",
source=f"runs:/{run_id}/model",
run_id=run_id,
description="RandomForest baseline v1"
)
print(f"Model Version: {model_version.version}")
Deployment Management with Aliases
# MLflow 2.x uses Aliases (Stage is deprecated)
client = MlflowClient()
# Set production alias
client.set_registered_model_alias(
name="iris-classifier",
alias="champion",
version=3
)
# Set challenger model
client.set_registered_model_alias(
name="iris-classifier",
alias="challenger",
version=5
)
# Load model by alias
champion_model = mlflow.pyfunc.load_model("models:/iris-classifier@champion")
challenger_model = mlflow.pyfunc.load_model("models:/iris-classifier@challenger")
# A/B testing
champion_pred = champion_model.predict(X_test)
challenger_pred = challenger_model.predict(X_test)
print(f"Champion accuracy: {accuracy_score(y_test, champion_pred)}")
print(f"Challenger accuracy: {accuracy_score(y_test, challenger_pred)}")
Using Model Tags
# Add tags to model version
client.set_model_version_tag(
name="iris-classifier",
version=3,
key="validation_status",
value="approved"
)
client.set_model_version_tag(
name="iris-classifier",
version=3,
key="approved_by",
value="data-science-lead"
)
# Search models by tag
from mlflow import search_model_versions
approved_versions = search_model_versions(
"name='iris-classifier' AND tag.validation_status='approved'"
)
Model Serving
Built-in MLflow Serving
# Local REST API serving
mlflow models serve \
-m "models:/iris-classifier@champion" \
--port 8080 \
--no-conda
# Test request
curl -X POST http://localhost:8080/invocations \
-H "Content-Type: application/json" \
-d '{"inputs": [[5.1, 3.5, 1.4, 0.2]]}'
Custom Serving with FastAPI
from fastapi import FastAPI
import mlflow.pyfunc
import numpy as np
app = FastAPI()
# Load model (once at server startup)
model = mlflow.pyfunc.load_model("models:/iris-classifier@champion")
@app.post("/predict")
async def predict(features: list[list[float]]):
predictions = model.predict(np.array(features))
return {
"predictions": predictions.tolist(),
"model_version": "champion"
}
@app.get("/health")
async def health():
return {"status": "healthy", "model": "iris-classifier@champion"}
Experiment Comparison and Analysis
Comparing in MLflow UI
# Search experiments (CLI)
mlflow runs list --experiment-id 1
# Search by metrics
mlflow runs list \
--experiment-id 1 \
--filter "metrics.accuracy > 0.95" \
--order-by "metrics.accuracy DESC"
Analysis with Python API
import mlflow
import pandas as pd
# Query all runs in an experiment
runs = mlflow.search_runs(
experiment_ids=["1"],
filter_string="metrics.accuracy > 0.9",
order_by=["metrics.accuracy DESC"],
max_results=10
)
# Analyze as DataFrame
print(runs[["run_id", "params.n_estimators", "params.max_depth", "metrics.accuracy"]])
# Find the best run
best_run = runs.iloc[0]
print(f"Best run: {best_run.run_id}, Accuracy: {best_run['metrics.accuracy']}")
Production Checklist
□ Set backend store to PostgreSQL/MySQL
□ Set artifact store to S3/GCS/MinIO
□ Configure authentication/authorization (OIDC, Basic Auth)
□ Set up automatic experiment logging (autolog)
□ Establish Model Registry alias conventions
□ Automate model validation in CI/CD
□ Configure model serving health checks
□ Define experiment cleanup policies (archive old runs)
Review Quiz (6 Questions)
Q1. What are the four core components of MLflow?
Tracking, Projects, Models, Model Registry
Q2. What is the difference between mlflow.log_params and mlflow.log_metrics?
log_params records training hyperparameters (strings), while log_metrics records performance metrics (numbers). Metrics support per-epoch tracking with the step parameter.
Q3. What concept is used for model deployment management in MLflow 2.x?
Aliases (e.g., @champion, @challenger). Stage has been deprecated.
Q4. When is the nested=True parameter used?
It is used when recording multiple child runs inside a parent run, such as during hyperparameter tuning.
Q5. Why use S3 as the artifact store?
It stores large artifacts like model files and plots in scalable object storage, making it easy to share across teams and manage versions.
Q6. What are the pros and cons of mlflow.autolog()?
Pros: Automatically records parameters/metrics/models without code changes. Cons: May record unnecessary information, and custom metrics still need to be logged separately.
Quiz
Q1: What is the main topic covered in "The Complete MLflow Guide: From Experiment Tracking to
Model Registry and Production Deployment"?
A hands-on walkthrough of the entire ML experiment management workflow with MLflow. Covers recording experiments with Tracking, version management with Model Registry, and production deployment.
Q2: What is MLflow?
MLflow is an open-source platform for managing the ML lifecycle. It consists of four core
components: MLflow Tracking (records experiment parameters, metrics, and artifacts), MLflow
Projects (packages ML code for reproducibility), MLflow Models (packages models from various
frameworks in a unified format), and MLflow Model Registry (model version management and
deployment workflows).
Q3: What are the key steps for Installation and Server Setup?
Basic installation with pip, then deployment with Docker Compose (PostgreSQL backend store and S3 artifact store).
Q4: What are the key aspects of Experiment Tracking?
Basic usage (logging params, metrics, tags, and artifacts), hyperparameter tuning tracking with nested runs, and PyTorch model tracking with per-epoch metrics.
Q5: How does Model Registry work?
Through model registration and version management, deployment management with aliases (e.g., @champion, @challenger), and model version tags for approval workflows.