Run Python code in the cloud with serverless containers, GPUs, and autoscaling. Use when deploying ML models, running batch jobs, scheduling tasks, serving APIs with GPU acceleration, or scaling compute-intensive workloads. Triggers on requests for serverless GPU infrastructure, LLM inference, model training/fine-tuning, parallel data processing, cron jobs in the cloud, or deploying Python web endpoints.
```
/plugin marketplace add ferdousbhai/cloud-fullstack-skills
/plugin install ferdousbhai-modal-deployment-plugins-modal-deployment@ferdousbhai/cloud-fullstack-skills
```

This skill inherits all available tools. When active, it can use any tool Claude has access to.
Reference guides:

- references/dict.md
- references/functions.md
- references/gpu.md
- references/images.md
- references/networking.md
- references/queue.md
- references/sandbox.md
- references/scaling.md
- references/storage.md
- references/web.md

Modal is a serverless platform for running Python in the cloud with zero configuration. Define everything in code: no YAML, Docker, or Kubernetes required.
```python
import modal

app = modal.App("my-app")

@app.function()
def hello():
    return "Hello from Modal!"

@app.local_entrypoint()
def main():
    print(hello.remote())
```
Run it with `modal run app.py`.
Decorate Python functions to run remotely:
@app.function(gpu="H100", memory=32768, timeout=600)
def train_model(data):
# Runs on H100 GPU with 32GB RAM, 10min timeout
return model.fit(data)
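Invoking it looks like calling a local function; a minimal usage sketch, assuming `data` is defined on the caller's side:

```python
# Blocks until the remote GPU container returns the result.
trained = train_model.remote(data)
```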
Define container environments via method chaining:
```python
image = (
    modal.Image.debian_slim(python_version="3.12")
    .apt_install("ffmpeg", "libsndfile1")
    .uv_pip_install("torch", "transformers", "numpy")
    .env({"CUDA_VISIBLE_DEVICES": "0"})
)

app = modal.App("ml-app", image=image)
```
Key image methods (a sketch combining several of these follows the list):

- `.debian_slim()` / `.micromamba()` - base images
- `.uv_pip_install()` / `.pip_install()` - Python packages
- `.apt_install()` - system packages
- `.run_commands()` - shell commands
- `.add_local_python_source()` - local modules
- `.env()` - environment variables
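A minimal sketch using `.run_commands()` and `.add_local_python_source()` alongside the methods shown above; the package names and the `utils` module are illustrative assumptions, not part of the examples in this doc:

```python
import modal

# Hypothetical image: "utils" is assumed to be a local Python module next to app.py.
image = (
    modal.Image.debian_slim(python_version="3.12")
    .apt_install("git")                 # system package
    .pip_install("requests")            # Python package
    .run_commands("mkdir -p /assets")   # arbitrary shell command, baked into the image
    .add_local_python_source("utils")   # ship a local module into the container
    .env({"MY_FLAG": "1"})              # environment variable
)
```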
@app.function(gpu="H100") # Single H100
@app.function(gpu="A100-80GB") # 80GB A100
@app.function(gpu="H100:4") # 4x H100
@app.function(gpu=["H100", "A100-40GB:2"]) # Fallback options
Available: B200, H200, H100, A100-80GB, A100-40GB, L40S, L4, A10G, T4
Load models once at container startup:
@app.cls(gpu="L40S")
class Model:
@modal.enter()
def load(self):
self.model = load_pretrained("model-name")
@modal.method()
def predict(self, x):
return self.model(x)
# Usage
Model().predict.remote(data)
Deploy APIs instantly:
```python
@app.function()
@modal.fastapi_endpoint()
def api(text: str):
    return {"result": process(text)}

# For complex apps, serve a full ASGI app
@app.function()
@modal.asgi_app()
def fastapi_app():
    from fastapi import FastAPI

    web = FastAPI()

    @web.get("/health")
    def health():
        return {"status": "ok"}

    return web
```
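After `modal deploy`, each endpoint gets a stable public URL, printed on deploy. A sketch calling the `api` endpoint above; the URL is a placeholder:

```python
import requests

# Placeholder URL: Modal prints the real one when you deploy.
url = "https://your-workspace--my-app-api.modal.run"
print(requests.get(url, params={"text": "hello"}).json())
```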
Persist files across containers with a Volume:

```python
volume = modal.Volume.from_name("my-data", create_if_missing=True)

@app.function(volumes={"/data": volume})
def save_file(content: str):
    with open("/data/output.txt", "w") as f:
        f.write(content)
    volume.commit()  # Persist changes so other containers can see them
```
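Changes committed by one container become visible to others after `reload()`; a sketch reading back the file written above:

```python
@app.function(volumes={"/data": volume})
def read_file() -> str:
    volume.reload()  # Fetch commits made by other containers
    with open("/data/output.txt") as f:
        return f.read()
```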
Inject credentials as environment variables with Secrets:

```python
@app.function(secrets=[modal.Secret.from_name("my-api-key")])
def call_api():
    import os
    key = os.environ["API_KEY"]
```
Create secrets in the dashboard or with `modal secret create my-secret KEY=value`.
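For quick experiments, a secret can also be built inline with `modal.Secret.from_dict`; prefer named secrets for real credentials:

```python
# Inline secret: convenient for testing, not for production keys.
@app.function(secrets=[modal.Secret.from_dict({"API_KEY": "test-key"})])
def call_api_test():
    import os
    print(os.environ["API_KEY"])
```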
Cache results across calls with a distributed Dict:

```python
cache = modal.Dict.from_name("my-cache", create_if_missing=True)

@app.function()
def cached_compute(key: str):
    if key in cache:
        return cache[key]
    result = expensive_computation(key)
    cache[key] = result
    return result
```
Coordinate producers and consumers with a distributed Queue:

```python
from queue import Empty

queue = modal.Queue.from_name("task-queue", create_if_missing=True)

@app.function()
def producer():
    queue.put_many([{"task": i} for i in range(10)])

@app.function()
def consumer():
    while True:
        try:
            task = queue.get(timeout=60)  # raises queue.Empty when the timeout expires
        except Empty:
            break
        process(task)
```
Fan out over many inputs (here `process` stands for any `@app.function()`-decorated function):

```python
# Map over inputs (auto-parallelized across containers)
results = list(process.map(items))

# Spawn async jobs and collect results later
calls = [process.spawn(item) for item in items]
results = [call.get() for call in calls]

# Batch processing (up to 1M inputs)
process.spawn_map(range(100_000))
```
Schedule recurring jobs:

```python
@app.function(schedule=modal.Period(hours=1))
def hourly_job():
    pass

@app.function(schedule=modal.Cron("0 9 * * 1-5"))  # 9am on weekdays
def daily_report():
    pass
```
CLI essentials:

```
modal run app.py        # Run a locally-triggered function
modal serve app.py      # Hot-reload web endpoints during development
modal deploy app.py     # Deploy persistently
modal shell app.py      # Interactive shell in the container
modal app list          # List deployed apps
modal app logs <name>   # Stream logs
modal volume list       # List volumes
modal secret list       # List secrets
```
@app.cls(gpu="H100", image=image)
class LLM:
@modal.enter()
def load(self):
from vllm import LLM
self.llm = LLM("meta-llama/Llama-3-8B")
@modal.method()
def generate(self, prompt: str):
return self.llm.generate(prompt)
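Once deployed, the class can be looked up and called from any Python process via `modal.Cls.from_name`; a sketch assuming the app was deployed under the name "ml-app":

```python
import modal

# Look up the deployed class by app and class name, then call it remotely.
LLM = modal.Cls.from_name("ml-app", "LLM")
print(LLM().generate.remote("Hello!"))
```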
Bake model weights into the image at build time so containers start with the weights already present:

```python
def download_model():
    from huggingface_hub import snapshot_download
    snapshot_download("model-id", local_dir="/models")

image = (
    modal.Image.debian_slim()
    .pip_install("huggingface-hub")
    .run_function(download_model)  # runs once at image build time
)
```
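Gated repositories need an auth token at build time; `run_function` accepts `secrets`, so a hedged variant (the secret name "huggingface-token" is an assumption):

```python
image = (
    modal.Image.debian_slim()
    .pip_install("huggingface-hub")
    # The named secret is assumed to expose HF_TOKEN to the build step.
    .run_function(download_model, secrets=[modal.Secret.from_name("huggingface-token")])
)
```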
Handle many concurrent inputs per container with async functions:

```python
import aiohttp

@app.function()
@modal.concurrent(max_inputs=100)
async def fetch_url(url: str):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()  # read the body before the session closes
```
Cut cold-start latency with memory snapshots:

```python
@app.cls(enable_memory_snapshot=True, gpu="A10G")
class FastModel:
    @modal.enter(snap=True)
    def load(self):
        self.model = load_model()  # this state is captured in the snapshot
```
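For GPU models, a common refinement is to keep the weights on the CPU in the snapshotted hook and move them to the GPU in a second, non-snapshotted hook that runs after each restore; a sketch assuming a PyTorch-style `.to()` method:

```python
@app.cls(enable_memory_snapshot=True, gpu="A10G")
class FastModel:
    @modal.enter(snap=True)
    def load(self):
        self.model = load_model()  # keep on CPU; captured in the snapshot

    @modal.enter(snap=False)
    def to_gpu(self):
        self.model = self.model.to("cuda")  # runs after each snapshot restore
```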
Tune autoscaling per function:

```python
@app.function(
    min_containers=2,      # always keep 2 warm
    max_containers=100,    # scale up to 100
    buffer_containers=5,   # extra buffer for bursts
    scaledown_window=300,  # keep idle containers for 5 min
)
def serve():
    pass
```
Use `@modal.enter()` for expensive initialization such as model loading.

See references/ for detailed guides on images, functions, GPUs, scaling, web endpoints, storage, dicts, queues, sandboxes, and networking.
Official docs: https://modal.com/docs