Auto-invoked when creating Docker files, CI/CD pipelines, deployment configs, Kubernetes manifests, or infrastructure code. Enforces Alpha AI DevOps standards.
Install via: `npx claudepluginhub anthropics/claude-plugins-community --plugin cortex`. This skill uses the workspace's default tool permissions.
Provides process, architecture, review, hiring, and testing guidelines for engineering teams relying on AI code generation.
Delivers DB-free sandbox API regression tests for Next.js/Vitest to catch AI blind spots in self-reviewed code changes like API routes and backend logic.
Compares coding agents like Claude Code and Aider on custom YAML-defined codebase tasks using git worktrees, measuring pass rate, cost, time, and consistency.
This skill enforces production-grade DevOps practices across all Alpha AI infrastructure. Every Dockerfile, CI/CD pipeline, Kubernetes manifest, and deployment configuration MUST comply with these standards.
Always pin exact base image versions (e.g. python:3.11-slim, NOT python:latest). Prefer -slim or -alpine variants for smaller images. Never use the latest tag in production Dockerfiles.
# WRONG
# Unpinned tag: image contents drift over time -> non-reproducible builds (hadolint DL3007)
FROM python:latest
# Missing version: "alpine" floats to whatever the latest Alpine-based Node release is
FROM node:alpine
# CORRECT
# Exact minor version + slim variant: reproducible and small
FROM python:3.11-slim
# Pins both the Node version and the Alpine base release
FROM node:20.11-alpine3.19
# REQUIRED: Multi-stage build pattern
# Stage 1: Build
# Heavy stage: installs dependencies; nothing here ships unless explicitly copied out.
FROM python:3.11-slim AS builder
WORKDIR /build
# Copy only the dependency manifest first so this layer (and the install below)
# stays cached until requirements.txt itself changes.
COPY requirements.txt .
# --prefix=/install isolates site-packages for a clean copy into the runtime stage;
# --no-cache-dir keeps the pip download cache out of the layer (hadolint DL3042).
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt
# Application source last: frequent code changes do not invalidate the deps layer.
COPY . .
# Stage 2: Production
FROM python:3.11-slim AS production
# Dedicated system (-r) user with no shell: the container runs unprivileged.
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
WORKDIR /app
# Bring in only installed packages and the app package -- no build tooling.
COPY --from=builder /install /usr/local
COPY --from=builder /build/app ./app
# Drop privileges before the entrypoint; all root-requiring steps are done above.
USER appuser
# Documentation only -- EXPOSE does not publish the port.
EXPOSE 8000
# stdlib-based probe: works in slim images without installing curl/wget.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Exec (JSON-array) form: uvicorn is PID 1 and receives SIGTERM directly.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
Always place the USER directive before CMD/ENTRYPOINT.
# REQUIRED: Non-root user setup
# Dedicated system user/group; -s /sbin/nologin blocks interactive login.
RUN groupadd -r appuser && useradd -r -g appuser -d /app -s /sbin/nologin appuser
# --chown at copy time avoids a follow-up RUN chown layer that would double the size.
COPY --chown=appuser:appuser . /app
# Everything after this line (CMD/ENTRYPOINT) runs unprivileged.
USER appuser
# REQUIRED: Health check patterns
# Python FastAPI
# stdlib-only probe: works in slim images that have no curl/wget installed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Node.js
# Exits 0 only on HTTP 200; the longer start-period allows for slower Node boot.
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
CMD node -e "require('http').get('http://localhost:3000/health', (r) => { process.exit(r.statusCode === 200 ? 0 : 1) })" || exit 1
# Simple curl-based (if curl is installed)
# -f makes curl return non-zero on HTTP error responses (4xx/5xx).
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
Use a .dockerignore to exclude unnecessary files from the build context.
# REQUIRED: Layer caching optimization
# Dependency manifest first: this layer and the install below stay cached
# until requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Application code changes often; copy LAST
COPY . .
# REQUIRED: Clean up in same layer
# update + install + cleanup in ONE RUN: deleting files in a later layer would not
# shrink the image, and a standalone apt-get update layer goes stale (DL3009).
# Versions are pinned (DL3008); --no-install-recommends avoids pulling extras.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential=12.9 \
libpq-dev=15.* \
&& rm -rf /var/lib/apt/lists/*
Every project MUST ship a .dockerignore.
# REQUIRED: .dockerignore minimum content
# Version control
.git
.gitignore
# Secrets -- must never enter the build context
.env
.env.*
# Python caches and test artifacts
__pycache__
*.pyc
*.pyo
.pytest_cache
.mypy_cache
.coverage
htmlcov
# JS dependency and build output (reinstalled/rebuilt inside the image)
node_modules
.next
dist
build
# Docs and tests are not needed at runtime
*.md
docs/
tests/
# Editor metadata
.vscode
.idea
# Compose files and Dockerfiles themselves are never needed inside the image
docker-compose*.yml
Dockerfile*
Use --no-install-recommends for apt and --no-cache for apk.
# REQUIRED: Package pinning
# Debian/Ubuntu: pin versions (DL3008); a wildcard patch pin (7.88.*) still
# accepts security point releases while keeping the minor version fixed.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl=7.88.* \
ca-certificates=20230311 \
&& rm -rf /var/lib/apt/lists/*
# Alpine variant
# apk pins use the full pkgver-rN revision; --no-cache skips the local index (DL3018).
RUN apk add --no-cache \
curl=8.5.0-r0 \
ca-certificates=20240226-r0
Every service MUST define a healthcheck configuration, and dependencies MUST use depends_on with condition: service_healthy.
# REQUIRED: Docker Compose service pattern
services:
  api:
    build:
      context: .
      dockerfile: Dockerfile
      # Build the production stage of the multi-stage Dockerfile.
      target: production
    ports:
      # Host port overridable via API_PORT; defaults to 8000.
      - "${API_PORT:-8000}:8000"
    env_file:
      - .env
    depends_on:
      # service_healthy gates startup on the dependency's healthcheck passing,
      # not merely on its container having started.
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      # stdlib probe -- no curl required in the image.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: "1.0"
        reservations:
          memory: 256M
          cpus: "0.25"
Use restart: unless-stopped for long-running services (restart: "no" for one-shot jobs), and depends_on with condition: service_healthy (NOT a bare depends_on list).
# REQUIRED: Named volumes
volumes:
  # Named volumes survive container re-creation; "local" is the default driver.
  postgres_data:
    driver: local
  redis_data:
    driver: local

services:
  postgres:
    image: postgres:16.1-alpine
    volumes:
      - postgres_data:/var/lib/postgresql/data
    environment:
      # Values come from .env -- never hardcode credentials here.
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD: ${DB_PASSWORD}
      POSTGRES_DB: ${DB_NAME}
    healthcheck:
      # pg_isready confirms the server accepts connections for this user/db.
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"]
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped
Use .env files for ALL environment variables (NEVER hardcode values); commit a .env.example template; provide defaults with the ${VAR:-default} syntax.
# REQUIRED: Environment file usage
services:
  api:
    env_file:
      - .env
    environment:
      # Composed from .env values; the hostname "postgres" resolves to the
      # database service over the compose network.
      - DATABASE_URL=postgresql://${DB_USER}:${DB_PASSWORD}@postgres:5432/${DB_NAME}
# REQUIRED: Resource limits
deploy:
  resources:
    limits:
      # Hard caps: the container is throttled / OOM-killed beyond these.
      memory: 1G
      cpus: "1.0"
    reservations:
      # Guaranteed minimum set aside for the service.
      memory: 256M
      cpus: "0.25"
Layer compose files: docker-compose.yml (base), docker-compose.dev.yml, docker-compose.prod.yml; run `docker compose -f docker-compose.yml -f docker-compose.prod.yml up` for production. Every pipeline MUST include these stages in order:
# REQUIRED: CI/CD pipeline structure
name: CI/CD Pipeline

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

env:
  PYTHON_VERSION: "3.11"
  NODE_VERSION: "20"
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  # Stage 1: static style checks -- fail fast before anything expensive runs.
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'
      - name: Install linting tools
        run: pip install ruff black isort
      - name: Run ruff
        run: ruff check .
      - name: Check formatting
        run: black --check . && isort --check .

  # Stage 2: strict static typing.
  type-check:
    runs-on: ubuntu-latest
    needs: lint
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'
      - name: Install dependencies
        run: pip install -r requirements.txt && pip install mypy
      - name: Run mypy
        run: mypy app/ --strict

  # Stage 3: tests against real service containers, parallelized with -n auto.
  test:
    runs-on: ubuntu-latest
    needs: type-check
    services:
      postgres:
        image: postgres:16.1-alpine
        env:
          POSTGRES_USER: test
          POSTGRES_PASSWORD: test
          POSTGRES_DB: test_db
        ports:
          - 5432:5432
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
      redis:
        image: redis:7.2-alpine
        ports:
          - 6379:6379
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'
      - name: Install dependencies
        run: pip install -r requirements.txt -r requirements-dev.txt
      - name: Run tests
        run: pytest --cov=app --cov-report=xml -n auto
        env:
          DATABASE_URL: postgresql://test:test@localhost:5432/test_db
          REDIS_URL: redis://localhost:6379
      - name: Upload coverage
        uses: codecov/codecov-action@v4
        with:
          # v4 input is "files" (plural); "file" is the deprecated v3 name.
          files: coverage.xml

  # Stage 4: build the image; push only from main.
  build:
    runs-on: ubuntu-latest
    needs: test
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      - uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - uses: docker/build-push-action@v5
        with:
          context: .
          push: ${{ github.ref == 'refs/heads/main' }}
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Stage 5: vulnerability scanning of the image and Python dependencies.
  security-scan:
    runs-on: ubuntu-latest
    needs: build
    steps:
      - uses: actions/checkout@v4
      - name: Run Trivy image scan
        # The registry tag only exists when build pushed it (main branch);
        # on pull requests this step would fail pulling an unpublished image.
        if: github.ref == 'refs/heads/main'
        uses: aquasecurity/trivy-action@master
        with:
          image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
          format: 'sarif'
          severity: 'CRITICAL,HIGH'
          exit-code: '1'
      - name: Run pip-audit
        run: |
          pip install pip-audit
          pip-audit -r requirements.txt

  # Stage 6: staging deploy, gated on main and the "staging" environment.
  deploy-staging:
    runs-on: ubuntu-latest
    needs: [security-scan]
    if: github.ref == 'refs/heads/main'
    environment: staging
    steps:
      - name: Deploy to staging
        run: |
          echo "Deploy to staging environment"
          # kubectl set image deployment/app app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}

  # Stage 7: production deploy behind an environment approval gate.
  deploy-production:
    runs-on: ubuntu-latest
    needs: [deploy-staging]
    if: github.ref == 'refs/heads/main'
    environment:
      name: production
      url: https://api.example.com
    steps:
      - name: Deploy to production
        run: |
          echo "Deploy to production environment"
          # kubectl set image deployment/app app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
Parallelize test suites where possible (pytest -n auto, Jest --maxWorkers); use needs dependencies to enforce job ordering.
# REQUIRED: Resource specification
resources:
  requests:
    # Scheduler guarantee -- used for bin-packing pods onto nodes.
    memory: "256Mi"
    cpu: "250m"
  limits:
    memory: "512Mi"
    cpu: "500m"

# REQUIRED: Health probes
# Liveness: restart the container when the process is wedged.
livenessProbe:
  httpGet:
    path: /health
    port: 8000
  initialDelaySeconds: 10
  periodSeconds: 30
  timeoutSeconds: 5
  failureThreshold: 3
# Readiness: remove the pod from Service endpoints while it is not ready.
readinessProbe:
  httpGet:
    path: /health/ready
    port: 8000
  initialDelaySeconds: 5
  periodSeconds: 10
  timeoutSeconds: 3
  failureThreshold: 3
# Startup: allows up to 30 x 10s for slow boots before liveness takes over.
startupProbe:
  httpGet:
    path: /health
    port: 8000
  failureThreshold: 30
  periodSeconds: 10
# REQUIRED: PDB for HA
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: api-pdb
spec:
  # Voluntary disruptions (node drains, upgrades) must leave >= 1 pod running.
  minAvailable: 1
  selector:
    matchLabels:
      app: api
# REQUIRED: HPA configuration
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api
  # Two replicas minimum keeps HA even at idle load.
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
# REQUIRED: Config and secrets pattern
# Non-sensitive tunables live in a ConfigMap...
apiVersion: v1
kind: ConfigMap
metadata:
  name: api-config
data:
  LOG_LEVEL: "info"
  WORKERS: "4"
  DB_POOL_SIZE: "20"
---
# ...sensitive values live in a Secret. base64 is encoding, NOT encryption:
# restrict access via RBAC and consider an external secrets operator.
apiVersion: v1
kind: Secret
metadata:
  name: api-secrets
type: Opaque
data:
  DATABASE_URL: <base64-encoded>
  JWT_SECRET: <base64-encoded>
  REDIS_URL: <base64-encoded>
# REQUIRED: Default deny ingress
# Empty podSelector matches every pod in the namespace; declaring Ingress
# with no rules denies all inbound traffic by default.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny-ingress
spec:
  podSelector: {}
  policyTypes:
    - Ingress
---
# Allow traffic from specific sources
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-api-ingress
spec:
  podSelector:
    matchLabels:
      app: api
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: nginx-ingress
      ports:
        - protocol: TCP
          port: 8000
Use preferredDuringSchedulingIgnoredDuringExecution for flexible scheduling.
# REQUIRED: Pod anti-affinity
affinity:
  podAntiAffinity:
    # "preferred" (soft) spreading: pods avoid sharing a node when possible,
    # but scheduling still succeeds on a small cluster.
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - api
          topologyKey: kubernetes.io/hostname
# REQUIRED: Full deployment template
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api
  labels:
    app: api
    version: v1
spec:
  replicas: 2
  selector:
    matchLabels:
      app: api
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # Zero-downtime rollout: never drop below the desired replica count.
      maxUnavailable: 0
      maxSurge: 1
  template:
    metadata:
      labels:
        app: api
        version: v1
    spec:
      serviceAccountName: api-sa
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      containers:
        - name: api
          # Immutable SHA tag -- never :latest.
          image: ghcr.io/alpha-ai/api:sha-abc123
          ports:
            - containerPort: 8000
              protocol: TCP
          envFrom:
            - configMapRef:
                name: api-config
            - secretRef:
                name: api-secrets
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 10
          securityContext:
            allowPrivilegeEscalation: false
            # NOTE(review): with a read-only root fs the app may need an
            # emptyDir mounted at /tmp for scratch space -- confirm per service.
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - api
                topologyKey: kubernetes.io/hostname
Expose a /metrics endpoint on EVERY service.
# REQUIRED: Prometheus metrics for FastAPI
from prometheus_fastapi_instrumentator import Instrumentator

# Request instrumentation: group status codes into classes, track in-flight
# requests, and exclude the probe/metrics endpoints from measurement so the
# scrapers and healthchecks do not pollute the latency histograms.
instrumentator = Instrumentator(
    should_group_status_codes=True,
    should_instrument_requests_inprogress=True,
    excluded_handlers=["/health", "/metrics"],
    inprogress_name="http_requests_inprogress",
    inprogress_labels=True,
)

# Attach the middleware to the FastAPI app and serve metrics at /metrics.
instrumentator.instrument(app).expose(app, endpoint="/metrics")
# REQUIRED: Structured logging configuration
import structlog

# Emit JSON log lines with ISO timestamps and any context bound via
# contextvars. Processor order matters: merge context first, render last.
structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ],
    logger_factory=structlog.PrintLoggerFactory(),
)
Define alerts for every production service:
# REQUIRED: Prometheus alerting rules
groups:
  - name: api-alerts
    rules:
      - alert: HighErrorRate
        # Ratio of 5xx responses over all requests in the last 5 minutes.
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
      - alert: HighLatency
        # p95 latency from the request-duration histogram exceeds 1 second.
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High p95 latency detected"
# REQUIRED: Terraform provider pinning
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    aws = {
      source = "hashicorp/aws"
      # Pessimistic pin: any 5.3x release, never a breaking 6.x upgrade.
      version = "~> 5.30"
    }
  }

  # Remote state with locking: S3 stores the state file, DynamoDB serializes
  # concurrent applies, and encrypt protects state at rest.
  backend "s3" {
    bucket         = "alpha-ai-terraform-state"
    key            = "prod/terraform.tfstate"
    region         = "us-east-1"
    dynamodb_table = "terraform-locks"
    encrypt        = true
  }
}
# REQUIRED: Default tags
# Shared tag set applied to every resource so ownership and provenance are
# always traceable back to the service, team, and repository.
locals {
  common_tags = {
    Environment = var.environment
    Service     = var.service_name
    Team        = "alpha-ai"
    ManagedBy   = "terraform"
    Repository  = "alpha-ai/infrastructure"
  }
}
Every PR touching infrastructure code MUST verify:
- All base image tags are pinned to exact versions (never latest)