Skill
monitoring-setup
Configure observability — structured logging, metrics (Prometheus/CloudWatch), tracing (OpenTelemetry), alerting, and dashboards. Use when the user says "set up monitoring", "add logging", "observability", "alerting", or needs to track application health in production.
From project-orchestrator
Install
1
Run in your terminal$
npx claudepluginhub vivekmano27/agent-orchestrator --plugin project-orchestrator
Tool Access
This skill is limited to using the following tools:
Read, Write, Edit, Bash, Grep, Glob
Skill Content
Monitoring Setup Skill
Output Structure
Generate monitoring configuration files into the project structure:
monitoring/
├── prometheus/
│ ├── prometheus.yml # Scrape config
│ └── rules/
│ ├── app-alerts.yml # Application alert rules
│ └── infra-alerts.yml # Infrastructure alert rules
├── grafana/
│ └── dashboards/
│ └── app-overview.json # Pre-built dashboard (if applicable)
└── docker-compose.monitoring.yml # Local monitoring stack
src/
├── lib/logger.ts # Structured logger setup
├── middleware/request-logger.ts # HTTP request logging middleware
└── routes/health.ts # Health check endpoint
Structured Logging Format
Every log line must be JSON with these mandatory fields. Human-readable logs are for dev only — production always uses JSON.
// src/lib/logger.ts
import pino from 'pino';

const isDevelopment = process.env.NODE_ENV === 'development';

/**
 * Application-wide structured logger.
 * Emits machine-parseable JSON; pretty, colorized output is enabled in
 * development only. Base fields (service, version, environment) are
 * attached to every log line.
 */
export const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level label ("info") instead of pino's numeric level.
    level: (label) => ({ level: label }),
  },
  base: {
    service: process.env.SERVICE_NAME || 'api',
    version: process.env.APP_VERSION || 'unknown',
    environment: process.env.NODE_ENV || 'development',
  },
  // Human-readable logs are for dev only — production stays JSON.
  transport: isDevelopment
    ? { target: 'pino-pretty', options: { colorize: true } }
    : undefined,
});

/**
 * Returns a child logger that stamps every entry with the request's
 * correlation ID, tying together all log lines for one request.
 */
export function createRequestLogger(correlationId: string) {
  return logger.child({ correlationId });
}
// src/middleware/request-logger.ts
import { randomUUID } from 'crypto';
import { createRequestLogger } from '../lib/logger';

/**
 * Express-style middleware: attaches a correlation-scoped logger to the
 * request and emits one structured access-log line when the response
 * finishes.
 */
export function requestLogger(req, res, next) {
  // Honor an inbound correlation ID so traces span service boundaries;
  // mint a fresh one when the caller did not send any.
  const correlationId = req.headers['x-correlation-id'] || randomUUID();
  req.log = createRequestLogger(correlationId);

  // Echo the ID back so clients can correlate their side of the trace.
  res.setHeader('x-correlation-id', correlationId);

  const startedAt = Date.now();

  res.on('finish', () => {
    const summary = `${req.method} ${req.originalUrl} ${res.statusCode}`;
    req.log.info({
      method: req.method,
      path: req.originalUrl,
      statusCode: res.statusCode,
      durationMs: Date.now() - startedAt,
      userAgent: req.headers['user-agent'],
      // Request bodies are deliberately omitted (may contain PII/secrets).
    }, summary);
  });

  next();
}
Log Line Examples
{"level":"info","time":1710500000,"service":"api","version":"1.2.0","environment":"production","correlationId":"550e8400-e29b-41d4-a716-446655440000","method":"POST","path":"/api/orders","statusCode":201,"durationMs":45,"msg":"POST /api/orders 201"}
{"level":"error","time":1710500001,"service":"api","version":"1.2.0","environment":"production","correlationId":"550e8400-e29b-41d4-a716-446655440000","err":{"type":"DatabaseError","message":"connection refused","stack":"..."},"msg":"Failed to create order"}
Health Check Endpoint
// src/routes/health.ts
interface HealthCheck {
  status: 'healthy' | 'degraded' | 'unhealthy';
  timestamp: string;
  uptime: number;
  version: string;
  checks: Record<string, { status: 'ok' | 'failing'; latencyMs?: number; message?: string }>;
}

/**
 * Runs one dependency probe with a hard timeout, measuring its latency.
 * The timeout timer is cleared on every path — the original setTimeout
 * was never cancelled, leaking a live timer (up to timeoutMs) after each
 * fast, successful probe.
 */
async function runCheck(
  probe: () => Promise<unknown>,
  timeoutMs = 3000,
): Promise<{ status: 'ok' | 'failing'; latencyMs?: number; message?: string }> {
  const start = Date.now();
  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    await Promise.race([
      probe(),
      new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('timeout')), timeoutMs);
      }),
    ]);
    return { status: 'ok', latencyMs: Date.now() - start };
  } catch (err) {
    // err may be anything thrown by the probe; narrow before reading .message.
    return { status: 'failing', message: err instanceof Error ? err.message : String(err) };
  } finally {
    if (timer !== undefined) clearTimeout(timer);
  }
}

/**
 * Aggregated health endpoint. Probes each dependency (database, Redis)
 * with a 3s timeout and reports per-check status and latency.
 *
 * Responds 503 when any check fails so load balancers stop routing
 * traffic to this node; 200 when all checks pass.
 * NOTE(review): 'degraded' exists in the type but is never produced by
 * this logic — confirm whether a partial-failure state is intended.
 */
export async function healthCheck(req, res) {
  const checks: HealthCheck['checks'] = {
    database: await runCheck(() => db.query('SELECT 1')),
    redis: await runCheck(() => redis.ping()),
  };

  const anyFailing = Object.values(checks).some((c) => c.status === 'failing');

  const result: HealthCheck = {
    status: anyFailing ? 'unhealthy' : 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    version: process.env.APP_VERSION || 'unknown',
    checks,
  };

  // 503 when unhealthy — the LB reads the status code, not the body.
  res.status(anyFailing ? 503 : 200).json(result);
}
Prometheus Scrape Configuration
# monitoring/prometheus/prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often alert/recording rules are evaluated
# Load every rule file under rules/ (app-alerts.yml, infra-alerts.yml)
rule_files:
  - "rules/*.yml"
scrape_configs:
  # Application metrics exposed by the service itself
  - job_name: "api"
    metrics_path: /metrics
    static_configs:
      - targets: ["api:3000"]
        # Extra labels attached to every series scraped from this target
        labels:
          service: api
          environment: production
  # Host-level metrics (CPU, memory, disk, network)
  - job_name: "node-exporter"
    static_configs:
      - targets: ["node-exporter:9100"]
  # PostgreSQL metrics (connections, locks, replication)
  - job_name: "postgres-exporter"
    static_configs:
      - targets: ["postgres-exporter:9187"]
# Where evaluated alerts are sent for routing/deduplication/notification
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
Alert Rule Templates
# monitoring/prometheus/rules/app-alerts.yml
#
# Every alert carries a `for` duration (min 5m critical / 10m warning) and a
# `runbook` annotation, per this skill's constraints. The original rules were
# missing runbooks on four of five alerts, and DBConnectionPoolExhausted was
# a warning with only `for: 5m`.
groups:
  - name: app.rules
    rules:
      # High error rate: >5% of requests returning 5xx
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5% for 5 minutes"
          description: "{{ $value | humanizePercentage }} of requests are failing"
          runbook: "https://wiki.example.com/runbooks/high-error-rate"

      # Slow response times (p99 over all routes)
      - alert: HighP99Latency
        expr: |
          histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
          > 2.0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "p99 latency above 2 seconds for 10 minutes"
          description: "p99 latency is {{ $value }}s"
          runbook: "https://wiki.example.com/runbooks/high-p99-latency"

      # Pod restarts (Kubernetes)
      - alert: PodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.pod }} is crash-looping"
          description: "{{ $value }} restarts in the last 15 minutes"
          runbook: "https://wiki.example.com/runbooks/pod-crash-looping"

      # Database connection pool exhaustion
      # `for` raised 5m -> 10m: warnings require a 10m minimum duration.
      - alert: DBConnectionPoolExhausted
        expr: |
          pg_stat_activity_count / pg_settings_max_connections > 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "DB connection pool above 80% capacity"
          description: "{{ $value | humanizePercentage }} of max_connections in use"
          runbook: "https://wiki.example.com/runbooks/db-connection-pool"

      # Disk space on the root filesystem
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Less than 10% disk space remaining"
          description: "{{ $value | humanizePercentage }} of root filesystem remaining"
          runbook: "https://wiki.example.com/runbooks/disk-space-low"
Constraints
- Every alert must have a `for` duration. Alerts without `for` fire on single-sample spikes and cause alert fatigue. Minimum `for` is 5m for critical, 10m for warning.
- Every alert must have a `runbook` annotation. An alert without a runbook just wakes someone up with no guidance. Link to a wiki page with diagnosis steps.
- Never log request/response bodies by default. They contain PII, passwords, tokens. Log them only in debug mode behind a feature flag, and only in non-production environments.
- Health check must return 503 when unhealthy. Returning 200 with `"status": "unhealthy"` in the body defeats load balancer health checking — the LB reads the status code, not the body.
- Correlation IDs must propagate across service boundaries. Read `x-correlation-id` from incoming requests, generate one if missing, and pass it to all downstream HTTP calls and log entries.
- Prometheus scrape interval and alert `for` duration must be compatible. If scrape interval is 15s and `for` is 1m, you only get ~4 data points before the alert fires. Use `for >= 5 * scrape_interval` as a minimum.
Similar Skills
Stats
Parent Repo Stars0
Parent Repo Forks0
Last CommitMar 15, 2026