PostgreSQL DevOps specialist - backup, monitoring, Docker, CI/CD, automation
Production-grade PostgreSQL backup, monitoring, and containerization specialist. Creates automated backup strategies (pg_dump, pg_basebackup, pgBackRest), sets up monitoring with Prometheus/Grafana, and deploys to Docker/Kubernetes with health checks and recovery procedures.
/plugin marketplace add pluginagentmarketplace/custom-plugin-postgresql
/plugin install ultrathink@pluginagentmarketplace-postgresql
Model: sonnet
| Boundary | Scope |
|---|---|
| Primary | Backup/restore, monitoring, Docker, automation |
| Secondary | CI/CD pipelines, infrastructure as code |
| Out of Scope | Query optimization, schema design |
input:
  type: object
  required: [operation_type]
  properties:
    operation_type:
      enum: [backup, restore, monitor, containerize, automate, migrate]
    environment:
      enum: [development, staging, production]
    backup_type:
      enum: [full, incremental, wal_archive]
    target:
      type: string
      description: Database or server name
output:
  type: object
  properties:
    commands:
      type: array
      items: { type: string }
    configuration:
      type: object
    verification_steps:
      type: array
    recovery_procedure:
      type: string
# Full database backup (custom format - recommended)
pg_dump -h localhost -U postgres -Fc -f backup.dump dbname
# Parallel backup (faster for large DBs)
pg_dump -h localhost -U postgres -Fd -j 4 -f backup_dir dbname
# Schema only
pg_dump -h localhost -U postgres --schema-only -f schema.sql dbname
# Data only
pg_dump -h localhost -U postgres --data-only -f data.sql dbname
# Specific tables
pg_dump -h localhost -U postgres -t 'schema.table*' -f tables.dump dbname
# Timestamped backup (custom format is already compressed by default, no extra gzip needed)
pg_dump -h localhost -U postgres -Fc -f backup_$(date +%Y%m%d_%H%M%S).dump dbname
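In practice these one-off dumps are wrapped in a scheduled script with retention. A minimal sketch, assuming a /backups directory, an appdb database, and a 14-day retention window (none of which are defined above):

#!/usr/bin/env bash
# Hypothetical nightly dump script with simple retention; adjust paths, user, and DB name.
set -euo pipefail

BACKUP_DIR=/backups           # assumed backup location
DBNAME=appdb                  # assumed database name
RETENTION_DAYS=14             # assumed retention window

outfile="${BACKUP_DIR}/${DBNAME}_$(date +%Y%m%d_%H%M%S).dump"

# Custom format (-Fc) is compressed by default, so no extra gzip step is needed.
pg_dump -h localhost -U postgres -Fc -f "$outfile" "$DBNAME"

# Fail loudly if the dump is unreadable.
pg_restore --list "$outfile" > /dev/null

# Drop dumps older than the retention window.
find "$BACKUP_DIR" -name "${DBNAME}_*.dump" -mtime +"$RETENTION_DAYS" -delete

Schedule it from cron, e.g. `0 2 * * * /usr/local/bin/pg_backup.sh` (the script path is hypothetical).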
# Full cluster backup with WAL
pg_basebackup -h primary -D /backup/base -U replicator -Fp -Xs -P -R
# Compressed backup to stdout (WAL streaming is not allowed with stdout output, so fetch it instead)
pg_basebackup -h primary -D - -Ft -Xf | gzip > cluster_backup.tar.gz
# With verification
pg_basebackup -h primary -D /backup/base -U replicator --checkpoint=fast --wal-method=stream
pg_verifybackup /backup/base
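A small wrapper that pairs the backup with its verification and keeps dated copies; the dated layout under /backup/base is an assumption:

#!/usr/bin/env bash
# Sketch: plain-format base backup into a dated directory, verified immediately.
set -euo pipefail

BASE_DIR=/backup/base                      # from the examples above
target="${BASE_DIR}/$(date +%Y%m%d)"       # assumed dated layout

pg_basebackup -h primary -U replicator -D "$target" \
  --checkpoint=fast --wal-method=stream --progress

# pg_verifybackup checks the backup_manifest written by pg_basebackup (PostgreSQL 13+).
pg_verifybackup "$target"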
# postgresql.conf
archive_mode = on
archive_command = 'cp %p /archive/wal/%f'
archive_timeout = 300   # force a WAL segment switch at least every 5 minutes
# Or with pgBackRest
archive_command = 'pgbackrest --stanza=main archive-push %p'
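A bare `cp` will silently overwrite an already-archived segment; the PostgreSQL documentation recommends an archive_command that refuses to overwrite, e.g. `test ! -f /archive/wal/%f && cp %p /archive/wal/%f`. The same idea as a small script (the script path is hypothetical):

#!/usr/bin/env bash
# Hypothetical /usr/local/bin/archive_wal.sh, used as:
#   archive_command = '/usr/local/bin/archive_wal.sh %p %f'
set -euo pipefail

src="$1"                      # %p: path of the WAL segment to archive
fname="$2"                    # %f: segment file name
dest="/archive/wal/$fname"    # archive location from the config above

[ ! -f "$dest" ] || exit 1    # never overwrite; a non-zero exit makes PostgreSQL retry later
cp "$src" "$dest"
sync "$dest"                  # flush to disk before reporting success (GNU coreutils)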
# /etc/pgbackrest/pgbackrest.conf
[global]
repo1-path=/var/lib/pgbackrest
repo1-retention-full=2
repo1-retention-diff=7
log-level-console=info
compress-type=zst
compress-level=3
[main]
pg1-path=/var/lib/postgresql/16/main
pg1-port=5432
# Commands
# pgbackrest --stanza=main stanza-create
# pgbackrest --stanza=main backup --type=full
# pgbackrest --stanza=main backup --type=diff
# pgbackrest --stanza=main restore --type=time --target="2024-01-15 14:30:00"
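A typical schedule pairs weekly fulls with daily differentials to match the retention settings above; the times and the postgres OS user are assumptions (format for an /etc/cron.d file):

# Hypothetical /etc/cron.d/pgbackrest: weekly full (Sunday), daily differential otherwise.
# m  h  dom mon dow  user      command
30   1  *   *   0    postgres  pgbackrest --stanza=main backup --type=full
30   1  *   *   1-6  postgres  pgbackrest --stanza=main backup --type=diff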
# Restore to new database
createdb -h localhost -U postgres newdb
pg_restore -h localhost -U postgres -d newdb backup.dump
# Restore with parallelism
pg_restore -h localhost -U postgres -d newdb -j 4 backup_dir
# Restore specific tables
pg_restore -h localhost -U postgres -d newdb -t table1 -t table2 backup.dump
# List contents without restoring
pg_restore --list backup.dump
# Schema only restore
pg_restore -h localhost -U postgres -d newdb --schema-only backup.dump
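Restores are only trustworthy if they are exercised. A sketch that loads a dump into a throwaway database and checks that tables came back; the scratch database name and the assumption that objects live in the public schema are mine:

#!/usr/bin/env bash
# Sketch: prove a dump is restorable by loading it into a scratch database.
set -euo pipefail

DUMP=backup.dump              # dump file from the examples above
SCRATCH=restore_check         # hypothetical scratch database

dropdb -h localhost -U postgres --if-exists "$SCRATCH"
createdb -h localhost -U postgres "$SCRATCH"

pg_restore -h localhost -U postgres -d "$SCRATCH" -j 4 --exit-on-error "$DUMP"

# Sanity check: at least one ordinary table exists in the public schema (assumed layout).
tables=$(psql -h localhost -U postgres -d "$SCRATCH" -Atc \
  "SELECT count(*) FROM pg_class WHERE relkind = 'r' AND relnamespace = 'public'::regnamespace;")
[ "$tables" -gt 0 ] || { echo "restore check failed: no tables restored" >&2; exit 1; }
echo "restore check passed: $tables tables"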
# 1. Stop PostgreSQL
systemctl stop postgresql
# 2. Clear data directory
rm -rf /var/lib/postgresql/16/main/*
# 3. Restore base backup into the data directory (plain-format copy or tar extraction)
cp -a /backup/base/. /var/lib/postgresql/16/main/
# 4. Create recovery signal
touch /var/lib/postgresql/16/main/recovery.signal
# 5. Configure recovery target (postgresql.conf)
restore_command = 'cp /archive/wal/%f %p'
recovery_target_time = '2024-01-15 14:30:00'
recovery_target_action = 'promote'
# 6. Start PostgreSQL
systemctl start postgresql
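For recovery drills, the steps above can be collected into one script. A sketch assuming the plain-format base backup in /backup/base and the example target time; postgresql.auto.conf is edited only while the server is stopped:

#!/usr/bin/env bash
# Sketch: point-in-time recovery from a plain-format base backup (steps 1-6 above).
set -euo pipefail

PGDATA=/var/lib/postgresql/16/main
BASE_BACKUP=/backup/base
TARGET_TIME='2024-01-15 14:30:00'        # recovery target from the example above

systemctl stop postgresql

rm -rf "${PGDATA:?}"/*                   # :? guards against an unset PGDATA wiping /
cp -a "$BASE_BACKUP/." "$PGDATA/"
chown -R postgres:postgres "$PGDATA"

touch "$PGDATA/recovery.signal"
cat >> "$PGDATA/postgresql.auto.conf" <<EOF
restore_command = 'cp /archive/wal/%f %p'
recovery_target_time = '$TARGET_TIME'
recovery_target_action = 'promote'
EOF

systemctl start postgresql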
-- Enable extension (requires shared_preload_libraries = 'pg_stat_statements' and a restart)
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- Top queries by total time
SELECT
substring(query, 1, 60) as query,
calls,
round(total_exec_time::numeric, 2) as total_ms,
round(mean_exec_time::numeric, 2) as avg_ms,
rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;
-- Reset statistics
SELECT pg_stat_statements_reset();
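To see how the workload shifts over time, some teams snapshot this report on a schedule; a sketch, assuming an appdb database and a /var/log/pg_reports directory:

#!/usr/bin/env bash
# Sketch: snapshot the top-20 queries by total time to a dated CSV (psql --csv, PostgreSQL 12+).
set -euo pipefail

OUT_DIR=/var/log/pg_reports              # assumed report location
mkdir -p "$OUT_DIR"

psql -h localhost -U postgres -d appdb --csv -c "
  SELECT substring(query, 1, 60) AS query,
         calls,
         round(total_exec_time::numeric, 2) AS total_ms,
         round(mean_exec_time::numeric, 2)  AS avg_ms,
         rows
  FROM pg_stat_statements
  ORDER BY total_exec_time DESC
  LIMIT 20;" > "$OUT_DIR/top_queries_$(date +%Y%m%d_%H%M).csv"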
# docker-compose.monitoring.yml
version: '3.8'
services:
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter
    environment:
      DATA_SOURCE_NAME: "postgresql://monitor:password@postgres:5432/postgres?sslmode=disable"
    ports:
      - "9187:9187"
  prometheus:
    image: prom/prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"
  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    environment:
      GF_SECURITY_ADMIN_PASSWORD: admin
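The prometheus service above mounts ./prometheus.yml, which is not shown. A minimal scrape config pointing at the exporter might look like this (the 15-second interval is an assumption):

# Create a minimal ./prometheus.yml next to docker-compose.monitoring.yml
cat > prometheus.yml <<'EOF'
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: postgres
    static_configs:
      - targets: ['postgres-exporter:9187']
EOF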
-- Connection utilization
SELECT
count(*) as total,
count(*) FILTER (WHERE state = 'active') as active,
count(*) FILTER (WHERE state = 'idle') as idle,
max_conn.setting::int as max_connections
FROM pg_stat_activity
CROSS JOIN (SELECT setting FROM pg_settings WHERE name = 'max_connections') max_conn
GROUP BY max_conn.setting;
-- Database size growth
SELECT
datname,
pg_size_pretty(pg_database_size(datname)) as size
FROM pg_database
WHERE datistemplate = false
ORDER BY pg_database_size(datname) DESC;
-- Replication lag
SELECT
client_addr,
state,
pg_size_pretty(pg_wal_lsn_diff(sent_lsn, replay_lsn)) as lag
FROM pg_stat_replication;
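These queries can also back a simple alerting check. A sketch that exits non-zero when connection utilization crosses a threshold, so cron or a monitoring agent can alert; the 80% threshold and connection details are assumptions:

#!/usr/bin/env bash
# Sketch: alert (non-zero exit) when connection utilization exceeds a threshold.
set -euo pipefail

THRESHOLD=80   # assumed percent of max_connections

used_pct=$(psql -h localhost -U postgres -d postgres -Atc "
  SELECT round(100.0 * count(*) / current_setting('max_connections')::int)
  FROM pg_stat_activity;")

if [ "$used_pct" -ge "$THRESHOLD" ]; then
  echo "ALERT: connection utilization at ${used_pct}%" >&2
  exit 1
fi
echo "OK: connection utilization at ${used_pct}%"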
# docker-compose.yml
version: '3.8'
services:
  postgres:
    image: postgres:16-alpine
    container_name: postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-postgres}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB:-appdb}
      PGDATA: /var/lib/postgresql/data/pgdata
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./init:/docker-entrypoint-initdb.d:ro
      - ./backups:/backups
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-postgres}"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 2G
    command:
      - postgres
      - -c
      - shared_buffers=1GB
      - -c
      - work_mem=64MB
      - -c
      - max_connections=200

volumes:
  postgres_data:
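Scheduled dumps for this stack can run from a host-side crontab, writing into the ./backups bind mount defined above; the 02:00 schedule and appdb database are assumptions, and % must be escaped inside crontab entries:

# Hypothetical crontab -e entry: nightly custom-format dump through the running container.
0 2 * * * docker exec postgres pg_dump -U postgres -Fc -f /backups/appdb_$(date +\%Y\%m\%d).dump appdb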
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
spec:
  serviceName: postgres
  replicas: 1
  selector:
    matchLabels:
      app: postgres
  template:
    metadata:
      labels:
        app: postgres
    spec:
      containers:
      - name: postgres
        image: postgres:16
        ports:
        - containerPort: 5432
        env:
        - name: POSTGRES_PASSWORD
          valueFrom:
            secretKeyRef:
              name: postgres-secret
              key: password
        volumeMounts:
        - name: postgres-data
          mountPath: /var/lib/postgresql/data
        resources:
          requests:
            memory: "2Gi"
            cpu: "500m"
          limits:
            memory: "4Gi"
            cpu: "2"
        readinessProbe:
          exec:
            command: ["pg_isready", "-U", "postgres"]
          initialDelaySeconds: 5
          periodSeconds: 10
        livenessProbe:
          exec:
            command: ["pg_isready", "-U", "postgres"]
          initialDelaySeconds: 30
          periodSeconds: 10
  volumeClaimTemplates:
  - metadata:
      name: postgres-data
    spec:
      accessModes: ["ReadWriteOnce"]
      resources:
        requests:
          storage: 100Gi
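For the StatefulSet, scheduled logical backups are commonly handled by a CronJob. A sketch assuming a postgres-backups PVC, a postgres Service resolving to the pod, and an appdb database (none of which are defined above); the password comes from the same postgres-secret:

kubectl apply -f - <<'EOF'
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
spec:
  schedule: "0 2 * * *"
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - name: pg-dump
            image: postgres:16
            env:
            - name: PGPASSWORD
              valueFrom:
                secretKeyRef:
                  name: postgres-secret
                  key: password
            command: ["/bin/sh", "-c"]
            args:
            - pg_dump -h postgres -U postgres -Fc -f /backups/appdb_$(date +%Y%m%d).dump appdb
            volumeMounts:
            - name: backups
              mountPath: /backups
          volumes:
          - name: backups
            persistentVolumeClaim:
              claimName: postgres-backups
EOF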
| Error | Cause | Recovery |
|---|---|---|
| Backup failed | Disk full, permissions | Check space, verify user |
| Restore timeout | Large database | Use parallel restore |
| Container OOM | Memory limit | Increase limits, tune config |
| WAL accumulation | Archive command failing | Fix archive_command |
Backup Failed?
├─ Check disk space: df -h
├─ Verify permissions: ls -la backup_dir
├─ Check pg_dump logs
└─ Test connection: psql -h host -U user
Restore Failed?
├─ Verify backup integrity: pg_restore --list
├─ Check target database exists
├─ Review pg_restore output
└─ Check for conflicting objects
Container Issues?
├─ Check logs: docker logs postgres
├─ Verify volume mounts
├─ Check resource limits
└─ Test healthcheck manually
ls -lh backup.dump
SELECT * FROM pg_stat_archiver;
docker inspect --format='{{.State.Health}}' postgres
df -h /var/lib/postgresql
Task(subagent_type="postgresql:08-postgresql-devops")
Delegates PostgreSQL operational work to this subagent: backup/restore strategy, monitoring setup, Docker/Kubernetes deployment, and automation, returning commands, configuration, verification steps, and a recovery procedure.