Debug Modal issues including container failures, GPU errors, and deployment problems
Diagnose and fix Modal deployment issues like container failures, GPU errors, and timeouts. Provides debugging commands and solutions for common problems.
/plugin marketplace add JosiahSiegel/claude-plugin-marketplace
/plugin install modal-master@claude-plugin-marketplace
Debug Modal issues including container failures, GPU errors, and deployment problems.
Help diagnose and resolve Modal issues:
Error Identification
Common Issues
Debugging Tools
# Interactive shell in container
modal shell app.py
# Shell with GPU
modal shell app.py --gpu A100
# View logs
modal app logs my-app
modal app logs my-app --follow
# List deployments
modal app list
# Check function status
modal app show my-app
Error: Container failed to start
Causes:
Solutions:
# Test image locally
modal shell app.py
# Check imports work
python -c "import your_module"
# Simplify image for debugging
image = modal.Image.debian_slim().pip_install("package")
Error: ModuleNotFoundError: No module named 'xxx'
Solutions:
# Add missing package
image = modal.Image.debian_slim().pip_install("missing_package")
# Use uv for faster installs
image = modal.Image.debian_slim().uv_pip_install("package")
# For local modules
image = image.add_local_python_source("my_module")
# Or use include_source
@app.function(include_source=True)
def my_func():
    """Placeholder function registered with ``include_source=True``."""
    pass
Error: GPU type xxx not available
Solutions:
# Add fallbacks
@app.function(gpu=["H100", "A100-80GB", "A100", "any"])
def gpu_func():
    """Placeholder function requesting a GPU from an ordered fallback list."""
    pass
# Use "any" for flexibility
@app.function(gpu="any")  # L4, A10G, or T4
def inference():
    """Placeholder inference function that accepts whichever GPU is available."""
    pass
Error: CUDA out of memory
Solutions:
# Use larger GPU
@app.function(gpu="A100-80GB") # Instead of A100-40GB
# Reduce batch size in code
batch_size = 8 # Instead of 32
# Clear cache in exit handler
@modal.exit()
def cleanup(self):
    """Release PyTorch's cached CUDA memory when the container shuts down.

    NOTE(review): takes ``self``, so this presumably belongs inside a class
    decorated with ``@app.cls(...)`` -- confirm when pasting into real code.
    """
    # Imported locally so torch is only required inside the container.
    import torch

    torch.cuda.empty_cache()
# Use gradient checkpointing for training
model.gradient_checkpointing_enable()
Error: Function timed out after xxx seconds
Solutions:
# Increase timeout
@app.function(timeout=3600)  # 1 hour
def long_running():
    """Placeholder for a long-running job given a one-hour timeout."""
    pass
# For web endpoints (max 150s)
@app.function(timeout=150)
@modal.asgi_app()
def web():
    """Return the ASGI application served by this web endpoint."""
    # NOTE(review): `app` here presumably refers to the ASGI application
    # (e.g. a FastAPI instance), not the `modal.App` object -- confirm and
    # consider a distinct name such as `web_app` to avoid the clash.
    return app
Error: Out of memory (CPU memory)
Solutions:
# Increase memory
@app.function(memory=16384)  # 16 GB
def memory_intensive():
    """Placeholder for a job that requests a larger memory allocation."""
    pass
# Process in chunks
def process_large_file(path):
    """Read the CSV at ``path`` in 10,000-row chunks and process each one.

    Each chunk is handed to ``process``, so the whole file is never loaded
    into memory at once. Assumes ``pd`` is pandas and ``process`` is defined
    by the caller -- neither is shown in this snippet.
    """
    for chunk in pd.read_csv(path, chunksize=10000):
        process(chunk)
Error: Secret 'xxx' not found
Solutions:
# Check secret exists
modal secret list
# Create secret
modal secret create my-secret KEY=value
# Check environment
MODAL_ENVIRONMENT=prod modal secret list
# Verify secret name matches
@app.function(secrets=[modal.Secret.from_name("exact-name")])
def func():
    """Placeholder function that attaches the secret named ``exact-name``."""
    pass
Error: Volume 'xxx' not found
Solutions:
# Create volume
modal volume create my-volume
# List volumes
modal volume list
# Create if missing
vol = modal.Volume.from_name("my-volume", create_if_missing=True)
Error: Name resolution failed or connection errors
Solutions:
# Add retries
@app.function(retries=3)
def network_func():
    """Placeholder for a network-bound function configured with ``retries=3``."""
    pass
# Increase timeout for slow APIs
import requests
response = requests.get(url, timeout=30)
Error: Image build failed
Solutions:
# Check package compatibility
image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("build-essential") # For compiled packages
.pip_install("package")
)
# Build in stages for debugging
base_image = modal.Image.debian_slim()
with_deps = base_image.pip_install("dep1", "dep2")
final = with_deps.pip_install("main_package")
# Test the function
modal run app.py::function_name --arg value
# Get interactive shell
modal shell app.py
# View recent logs
modal app logs my-app
# Follow logs in real-time
modal app logs my-app --follow
# Minimal reproduction
@app.function()
def debug_func():
    """Minimal reproduction: print whether CUDA is visible to PyTorch."""
    # Add one thing at a time
    import torch

    print(f"CUDA available: {torch.cuda.is_available()}")
# Log resource usage
@app.function(gpu="A100")
def check_resources():
    """Print the GPU name, total GPU memory, and total CPU memory.

    Memory figures are divided by 1e9, i.e. reported in decimal gigabytes.
    """
    import torch
    import psutil

    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CPU Memory: {psutil.virtual_memory().total / 1e9:.1f} GB")
import modal
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = modal.App("debug-app")
@app.function()
def debuggable_func(data):
    """Run ``process`` then ``transform`` on ``data``, logging each step.

    Args:
        data: Input handed to ``process`` (defined elsewhere).

    Returns:
        Whatever ``transform`` returns for the processed result.

    Raises:
        Exception: Any error from either step is logged with its traceback
            and then re-raised unchanged.
    """
    logger.info("Starting with data: %s", data)
    try:
        logger.info("Step 1: Processing")
        result = process(data)
        logger.info("Step 1 complete: %d items", len(result))

        logger.info("Step 2: Transforming")
        transformed = transform(result)
        logger.info("Step 2 complete")

        return transformed
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback,
        # equivalent to logger.error(..., exc_info=True).
        logger.exception("Error: %s", e)
        raise
import time
@app.function()
def profile_func():
    """Run ``step1`` and ``step2``, printing per-step and total elapsed time.

    Uses ``time.perf_counter`` rather than ``time.time`` because it is
    monotonic and high-resolution, so the measured intervals are not skewed
    by system clock adjustments.
    """
    start = time.perf_counter()

    # Phase 1
    t1 = time.perf_counter()
    step1()
    print(f"Step 1: {time.perf_counter() - t1:.2f}s")

    # Phase 2
    t2 = time.perf_counter()
    step2()
    print(f"Step 2: {time.perf_counter() - t2:.2f}s")

    print(f"Total: {time.perf_counter() - start:.2f}s")
modal run?