Deploy ML models to production - APIs, containerization, monitoring, and MLOps
Deploy ML models to production with FastAPI, Docker, and monitoring. Use when you need to containerize models, create APIs, or set up MLOps pipelines.
/plugin marketplace add pluginagentmarketplace/custom-plugin-machine-learning
/plugin install machine-learning-assistant@pluginagentmarketplace-machine-learning
This skill inherits all available tools. When active, it can use any tool Claude has access to.
assets/config.yaml
assets/schema.json
references/GUIDE.md
references/PATTERNS.md
scripts/validate.py
Take models from development to production.
from fastapi import FastAPI
from pydantic import BaseModel
import numpy as np
import joblib

app = FastAPI(title="ML Model API")
model = joblib.load('model.pkl')

class PredictRequest(BaseModel):
    features: list[float]

class PredictResponse(BaseModel):
    prediction: float

@app.post("/predict", response_model=PredictResponse)
async def predict(request: PredictRequest):
    X = np.array([request.features])
    prediction = model.predict(X)[0]
    return PredictResponse(prediction=float(prediction))

@app.get("/health")
async def health():
    return {"status": "healthy"}
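Once the service is running (for example via the same `uvicorn main:app` command used in the Dockerfile below), it can be smoke-tested from any HTTP client. A minimal sketch using `requests`; the URL and the three-element feature vector are assumptions, since the expected input size depends on the model you load:

```python
import requests

# Assumes the API above is running locally on port 8000.
print(requests.get("http://localhost:8000/health").json())

# The feature vector is illustrative; match your model's expected input size.
resp = requests.post(
    "http://localhost:8000/predict",
    json={"features": [1.0, 2.0, 3.0]},
)
print(resp.json())  # {"prediction": <float>}
```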
import torch
import torch.onnx

# Export PyTorch to ONNX
def export_to_onnx(model, sample_input, path='model.onnx'):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=14,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}}
    )
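As a quick illustration of the helper above, here is a hedged usage sketch with a throwaway linear model; the architecture and shapes are placeholders, not part of the original:

```python
# Hypothetical usage: export a toy 3-feature regression model.
toy_model = torch.nn.Linear(3, 1)
sample_input = torch.randn(1, 3)  # one example batch
export_to_onnx(toy_model, sample_input, path='model.onnx')
```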
# ONNX inference
import onnxruntime as ort

session = ort.InferenceSession('model.onnx')
input_name = session.get_inputs()[0].name
# input_data: a NumPy float32 array shaped like the exported model's input
output = session.run(None, {input_name: input_data})[0]
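Tying the two together, here is a sketch of serving the exported model behind the same FastAPI app; it reuses the `app` and `PredictRequest` from the earlier example, and the endpoint name and scalar-output assumption are mine, not the original's:

```python
import numpy as np
import onnxruntime as ort

onnx_session = ort.InferenceSession('model.onnx')   # load the exported model once
onnx_input = onnx_session.get_inputs()[0].name

@app.post("/predict_onnx")
async def predict_onnx(request: PredictRequest):
    # ONNX Runtime expects float32 inputs for a float32-exported model.
    X = np.array([request.features], dtype=np.float32)
    output = onnx_session.run(None, {onnx_input: X})[0]
    # .item() assumes a single scalar prediction per request.
    return {"prediction": float(output[0].item())}
```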
# Dockerfile
FROM python:3.10-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# docker-compose.yml
version: '3.8'
services:
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/models/model.onnx
    volumes:
      - ./models:/models:ro
    restart: unless-stopped
import time

import numpy as np
from prometheus_client import Counter, Histogram, start_http_server

# Define metrics
REQUESTS = Counter('model_requests_total', 'Total requests', ['status'])
LATENCY = Histogram('model_latency_seconds', 'Latency in seconds')

@app.post("/predict")
async def predict(request: PredictRequest):
    start = time.time()
    try:
        prediction = model.predict(np.array([request.features]))[0]
        REQUESTS.labels(status='success').inc()
        return {"prediction": float(prediction)}
    except Exception:
        REQUESTS.labels(status='error').inc()
        raise
    finally:
        LATENCY.observe(time.time() - start)
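The `start_http_server` import above is what actually exposes these metrics to Prometheus; a one-line sketch, with an arbitrary port choice not taken from the original:

```python
# Serve /metrics for Prometheus scraping in a background thread (port 8001 is arbitrary).
start_http_server(8001)
```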
import mlflow

# Log model
with mlflow.start_run():
    mlflow.log_params({"n_estimators": 100, "max_depth": 10})
    mlflow.log_metrics({"accuracy": 0.95, "f1": 0.93})
    mlflow.sklearn.log_model(model, "model")

# Load model
model_uri = "runs:/abc123/model"
model = mlflow.sklearn.load_model(model_uri)
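If a model registry is in use, the logged run can also be promoted to a named, versioned model. A hedged sketch: the registry name is illustrative, and `abc123` is the placeholder run ID from the snippet above:

```python
# Register the logged artifact as a new version of a named registry model.
mlflow.register_model("runs:/abc123/model", "ml-api-model")
```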
import random

class ABTest:
    def __init__(self, variants: dict[str, float]):
        self.variants = variants  # e.g. {"A": 0.5, "B": 0.5}
        self.results = {v: {"count": 0, "success": 0} for v in variants}

    def get_variant(self, user_id: str) -> str:
        # Seed a local RNG with the user ID so assignment is deterministic
        # across processes and does not disturb the global random state.
        r = random.Random(user_id).random()
        cumulative = 0.0
        for variant, weight in self.variants.items():
            cumulative += weight
            if r <= cumulative:
                return variant
        return list(self.variants.keys())[-1]

    def record(self, variant: str, success: bool):
        self.results[variant]["count"] += 1
        if success:
            self.results[variant]["success"] += 1
# TODO: Create a FastAPI service that:
# 1. Loads a model on startup
# 2. Has /predict and /health endpoints
# 3. Validates input with Pydantic
# TODO: Containerize your ML service
# Create Dockerfile and docker-compose.yml
import pytest
from fastapi.testclient import TestClient

from main import app  # the FastAPI app defined in main.py (per the Dockerfile CMD)

def test_health_endpoint():
    """Test health check."""
    client = TestClient(app)
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json()["status"] == "healthy"

def test_predict_endpoint():
    """Test prediction."""
    client = TestClient(app)
    response = client.post("/predict", json={"features": [1.0, 2.0, 3.0]})
    assert response.status_code == 200
    assert "prediction" in response.json()
| Problem | Cause | Solution |
|---|---|---|
| High latency | Model too large | Quantize, use ONNX |
| Memory leaks | Poor cleanup | Implement proper lifecycle |
| API errors | Input validation | Add Pydantic schemas |
| Scaling issues | Blocking I/O | Use async, add workers |
Version: 1.4.0 | Status: Production Ready