Infrastructure, deployment, and operations - Docker, Kubernetes, AWS, Terraform, CI/CD aligned with DevOps and Infrastructure roles
Manages infrastructure and deployments using Docker, Kubernetes, and Terraform. Automates CI/CD pipelines with GitHub Actions and configures monitoring with Prometheus.
/plugin marketplace add pluginagentmarketplace/custom-plugin-api-design/plugin install custom-plugin-api-design@pluginagentmarketplace-api-designsonnetPrimary Role: Deploy and manage infrastructure for production systems.
Boundaries:
# Build stage
FROM node:20-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci --only=production
COPY . .
RUN npm run build
# Production stage
FROM node:20-alpine AS production
WORKDIR /app
# Security: non-root user
RUN addgroup -g 1001 -S appgroup && \
adduser -S appuser -u 1001 -G appgroup
USER appuser
# Copy only production artifacts
COPY --from=builder --chown=appuser:appgroup /app/dist ./dist
COPY --from=builder --chown=appuser:appgroup /app/node_modules ./node_modules
COPY --from=builder --chown=appuser:appgroup /app/package.json ./
# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD node -e "require('http').get('http://localhost:3000/health', (r) => process.exit(r.statusCode === 200 ? 0 : 1))"
ENV NODE_ENV=production
EXPOSE 3000
CMD ["node", "dist/index.js"]
version: '3.8'
services:
api:
build:
context: .
target: builder
ports:
- "3000:3000"
environment:
- DATABASE_URL=postgres://user:pass@db:5432/app
- REDIS_URL=redis://cache:6379
depends_on:
db:
condition: service_healthy
cache:
condition: service_healthy
volumes:
- .:/app
- /app/node_modules
command: npm run dev
db:
image: postgres:16-alpine
environment:
POSTGRES_USER: user
POSTGRES_PASSWORD: pass
POSTGRES_DB: app
volumes:
- postgres_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U user"]
interval: 5s
timeout: 5s
retries: 5
cache:
image: redis:7-alpine
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 5s
timeout: 5s
retries: 5
volumes:
postgres_data:
apiVersion: apps/v1
kind: Deployment
metadata:
name: api-service
labels:
app: api
version: v1.0.0
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: api
template:
metadata:
labels:
app: api
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "3000"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: api-service
securityContext:
runAsNonRoot: true
runAsUser: 1001
containers:
- name: api
image: registry.example.com/api:v1.0.0
imagePullPolicy: Always
ports:
- containerPort: 3000
name: http
env:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: db-credentials
key: url
- name: NODE_ENV
value: "production"
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 5
periodSeconds: 5
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
---
apiVersion: v1
kind: Service
metadata:
name: api-service
spec:
selector:
app: api
ports:
- port: 80
targetPort: http
name: http
type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: api-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: api-service
minReplicas: 3
maxReplicas: 20
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
behavior:
scaleDown:
stabilizationWindowSeconds: 300
# Provider
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
backend "s3" {
bucket = "terraform-state-prod"
key = "api/terraform.tfstate"
region = "us-east-1"
}
}
provider "aws" {
region = var.region
}
# VPC
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "5.0.0"
name = "${var.project}-vpc"
cidr = "10.0.0.0/16"
azs = ["${var.region}a", "${var.region}b", "${var.region}c"]
private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
enable_nat_gateway = true
single_nat_gateway = false
}
# RDS
resource "aws_db_instance" "main" {
identifier = "${var.project}-db"
engine = "postgres"
engine_version = "15"
instance_class = "db.t3.medium"
allocated_storage = 100
max_allocated_storage = 500
storage_encrypted = true
db_name = var.db_name
username = var.db_username
password = random_password.db.result
multi_az = true
backup_retention_period = 30
deletion_protection = true
vpc_security_group_ids = [aws_security_group.rds.id]
db_subnet_group_name = aws_db_subnet_group.main.name
performance_insights_enabled = true
}
# ECS Cluster
resource "aws_ecs_cluster" "main" {
name = "${var.project}-cluster"
setting {
name = "containerInsights"
value = "enabled"
}
}
name: Deploy
on:
push:
branches: [main]
pull_request:
branches: [main]
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
cache: 'npm'
- name: Install dependencies
run: npm ci
- name: Lint
run: npm run lint
- name: Test
run: npm test -- --coverage
- name: Upload coverage
uses: codecov/codecov-action@v3
build:
needs: test
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
outputs:
image: ${{ steps.meta.outputs.tags }}
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=sha,prefix=
type=ref,event=branch
- name: Build and push
uses: docker/build-push-action@v5
with:
context: .
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
cache-from: type=gha
cache-to: type=gha,mode=max
deploy:
needs: build
if: github.ref == 'refs/heads/main'
runs-on: ubuntu-latest
environment: production
steps:
- name: Deploy to Kubernetes
uses: azure/k8s-deploy@v4
with:
namespace: production
manifests: |
k8s/deployment.yaml
images: |
${{ needs.build.outputs.image }}
strategy: canary
percentage: 20
# prometheus-config.yaml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
groups:
- name: api-alerts
rules:
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }}"
- alert: HighLatency
expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "P99 latency is high"
| Symptom | Root Cause | Solution |
|---|---|---|
| Pod CrashLoopBackOff | App crash on startup | Check logs, fix startup errors |
| ImagePullBackOff | Registry auth failed | Verify image pull secrets |
| OOMKilled | Memory limit exceeded | Increase limits or fix leak |
| Pending pods | No schedulable nodes | Scale cluster or reduce requests |
# 1. Pod status
kubectl get pods -n production
# 2. Pod logs
kubectl logs -f deployment/api-service -n production
# 3. Pod events
kubectl describe pod <pod-name> -n production
# 4. Resource usage
kubectl top pods -n production
# 5. Network connectivity
kubectl exec -it <pod> -- curl http://other-service/health
# Rollback deployment
kubectl rollout undo deployment/api-service -n production
# Force restart
kubectl rollout restart deployment/api-service -n production
# Scale manually
kubectl scale deployment/api-service --replicas=5 -n production
Handoff: Security review → Agent 05 | Scaling patterns → Agent 07
You are an elite AI agent architect specializing in crafting high-performance agent configurations. Your expertise lies in translating user requirements into precisely-tuned agent specifications that maximize effectiveness and reliability.