**Purpose**: Troubleshoot server, network, disk, and cloud infrastructure issues.
Diagnose and resolve infrastructure bottlenecks including high CPU, memory exhaustion, disk space issues, and network problems. Use this when servers are slow, applications are crashing, or you need to troubleshoot cloud resources like AWS EC2/EBS and Azure VMs.
/plugin marketplace add anton-abyzov/specweave
/plugin install sw-infra@specweave
**High CPU Usage**
Symptoms:
Diagnosis:
# Overall CPU usage
top -bn1 | grep "Cpu(s)"
# Top CPU processes
top -bn1 | head -20
# CPU usage per core
mpstat -P ALL 1 5
# Historical CPU (if sar installed)
sar -u 1 10
Red flags:
# Top CPU process
ps aux | sort -nrk 3,3 | head -10
# CPU per thread
top -H
# Process tree
pstree -p
Common causes:
# 1. Limit process CPU (nice)
renice +10 <PID> # Lower priority
# 2. Kill process (last resort)
kill -TERM <PID> # Graceful
kill -KILL <PID> # Force kill
# 3. Scale horizontally (add servers)
# Cloud: Auto-scaling group
# 4. Scale vertically (bigger instance)
# Cloud: Resize instance
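If the offending process is managed by systemd, a gentler option than renicing or killing it is to cap its CPU share with a cgroup quota. A minimal sketch; `app.service` and the batch-job path are placeholders, not names from this runbook:
# Cap an existing unit at roughly half of one core
systemctl set-property app.service CPUQuota=50%
# Or launch an ad-hoc command under a quota (path is an example)
systemd-run --scope -p CPUQuota=50% -- /usr/local/bin/batch-job.sh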
**Memory Exhaustion**
Symptoms:
Diagnosis:
# Current memory usage
free -h
# Memory per process
ps aux | sort -nrk 4,4 | head -10
# Check OOM killer logs
dmesg | grep -i "out of memory\|oom"
grep "Out of memory" /var/log/syslog
# Check swap usage
swapon -s
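To see which application accounts for the most memory overall (rather than per PID), you can aggregate resident set size by command name. A small sketch using only ps and awk:
# Sum RSS per command name (in MB), largest first
ps -eo rss,comm --no-headers \
  | awk '{ mem[$2] += $1 } END { for (c in mem) printf "%8.1f MB  %s\n", mem[c]/1024, c }' \
  | sort -rn | head -15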
Red flags:
# 1. Free page cache (safe)
sync && echo 3 > /proc/sys/vm/drop_caches
# 2. Kill memory-heavy process
kill -9 <PID>
# 3. Increase swap (temporary)
dd if=/dev/zero of=/swapfile bs=1M count=2048   # 2 GB swap file
chmod 600 /swapfile                             # required, or swapon warns about insecure permissions
mkswap /swapfile
swapon /swapfile
# 4. Scale up (more RAM)
# Cloud: Resize instance
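If the swap file from step 3 should survive a reboot, register it in /etc/fstab and optionally lower swappiness so the kernel prefers RAM. A sketch of the usual follow-up; the sysctl file name and the value 10 are conventional choices, not requirements:
# Persist the swap file across reboots
echo '/swapfile none swap sw 0 0' >> /etc/fstab
# Prefer RAM over swap (takes effect immediately)
sysctl vm.swappiness=10
# Persist the setting
echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf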
**Disk Space Issues**
Symptoms:
Diagnosis:
# Disk usage by partition
df -h
# Disk usage by directory
du -sh /*
du -sh /var/*
# Find large files
find / -type f -size +100M -exec ls -lh {} \;
# Find files using deleted space
lsof | grep deleted
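When df and du disagree, or / is simply large, it helps to drill down one directory level at a time while staying on a single filesystem. A quick sketch with GNU du:
# Largest directories directly under /, one filesystem only (-x)
du -xhd1 / 2>/dev/null | sort -rh | head -15
# Repeat for the biggest hit, e.g. /var
du -xhd1 /var 2>/dev/null | sort -rh | head -15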
Red flags:
# 1. Clean up logs
find /var/log -name "*.log.*" -mtime +7 -delete
journalctl --vacuum-time=7d
# 2. Clean up temp files
rm -rf /tmp/*
rm -rf /var/tmp/*
# 3. Reclaim space held by deleted-but-open files
#    List the processes holding them, then restart only those services
#    (or truncate the open file; see the sketch after this list) --
#    piping every PID to kill -9 is unsafe
lsof | grep deleted | awk '{print $2}' | sort -u
# 4. Compress logs
gzip /var/log/*.log
# 5. Expand disk (cloud)
# AWS: Modify EBS volume size
# Azure: Expand managed disk
# After expanding, grow the partition, then the filesystem:
growpart /dev/xvda 1   # grow partition 1 (cloud-utils / cloud-guest-utils package)
resize2fs /dev/xvda1   # ext4
xfs_growfs /           # xfs
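As mentioned in step 3, killing every process that holds a deleted file is rarely necessary: you can usually truncate the deleted file in place through /proc, which frees the space while the process keeps running. A sketch, assuming lsof showed PID 1234 holding the deleted file on file descriptor 4 (both values are examples):
# Confirm which descriptor points at the deleted file
ls -l /proc/1234/fd | grep deleted
# Truncate it in place; the process keeps its handle, the space is freed
: > /proc/1234/fd/4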
**Network Problems**
Symptoms:
Diagnosis:
# Ping test
ping -c 5 google.com
# DNS resolution
nslookup example.com
dig example.com
# Traceroute
traceroute example.com
# Check network interfaces
ip addr show
ifconfig
# Check routing table
ip route show
route -n
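Ping and traceroute only prove basic reachability; most application problems involve a specific TCP port. A quick sketch for checking a port end to end (example.com and port 443 are placeholders):
# TCP connect test
nc -zv example.com 443
# Same check with connection details and timing
curl -v -s -o /dev/null --connect-timeout 5 https://example.com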
Red flags:
# Current bandwidth usage
iftop -i eth0
# Network stats
netstat -i
# Historical bandwidth (if vnstat installed)
vnstat -l
# Check for bandwidth limits (cloud)
# AWS: Check CloudWatch NetworkIn/NetworkOut
# Check iptables rules
iptables -L -n -v
# Check firewalld (RHEL/CentOS)
firewall-cmd --list-all
# Check UFW (Ubuntu)
ufw status verbose
# Check security groups (cloud)
# AWS: EC2 → Security Groups
# Azure: Network Security Groups
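Before blaming the firewall or a security group, confirm the service is actually listening on the expected port and interface. A quick check with ss (part of iproute2); port 80 is an example:
# All listening TCP sockets with owning process
ss -tlnp
# Just the port you care about
ss -tlnp | grep ':80 '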
Common causes:
# 1. Allow HTTP/HTTPS through the firewall
#    (-I inserts at the top of the chain so an earlier DROP rule cannot shadow it)
iptables -I INPUT -p tcp --dport 80 -j ACCEPT
iptables -I INPUT -p tcp --dport 443 -j ACCEPT
# 2. Restart networking
systemctl restart networking
systemctl restart NetworkManager
# 3. Flush DNS cache
resolvectl flush-caches          # newer systemd
systemd-resolve --flush-caches   # older systemd
# 4. Check cloud network ACLs
# Ensure subnet has route to internet gateway
**Disk I/O Bottlenecks**
Symptoms:
Diagnosis:
# Disk I/O stats
iostat -x 1 5
# Look for:
# - %util >80% (disk saturated)
# - await >100ms (high latency)
# Top I/O processes
iotop -o
# Historical I/O (if sar installed)
sar -d 1 10
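If iotop is not available, pidstat from the sysstat package (the same package that provides sar) reports per-process read/write rates. A sketch, assuming sysstat is installed:
# Per-process disk read/write rates: 1-second samples, 5 iterations
pidstat -d 1 5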
Red flags:
# 1. Database without indexes (Seq Scan)
# See database-diagnostics.md
# 2. Log rotation running
# Large logs being compressed
# 3. Backup running
# Database dump, file backup
# 4. Disk issue (bad sectors)
dmesg | grep -i "I/O error"
smartctl -a /dev/sda # SMART status
# 1. Reduce I/O pressure
# Stop non-critical processes (backup, log rotation)
# 2. Add read cache
# Enable query caching (database)
# Add Redis for application cache
# 3. Scale disk IOPS (cloud)
# AWS: Change EBS volume type (gp2 → gp3 → io1)
# Azure: Change disk tier
# 4. Move to SSD (if on HDD)
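On AWS, step 3 can be done online with a single modify-volume call; gp3 lets you set IOPS and throughput independently of size. A sketch, with the volume ID and the numbers as example values:
# Convert a gp2 volume to gp3 with higher IOPS and throughput
aws ec2 modify-volume \
  --volume-id vol-1234567890abcdef0 \
  --volume-type gp3 \
  --iops 6000 \
  --throughput 250
# Watch the modification progress
aws ec2 describe-volumes-modifications --volume-ids vol-1234567890abcdef0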
**Service Crashes**
Symptoms:
Diagnosis:
# Systemd services
systemctl status nginx
systemctl status postgresql
systemctl status application
# Check if process running
ps aux | grep nginx
pidof nginx
# Check service logs
journalctl -u nginx -n 50
tail -f /var/log/nginx/error.log
Red flags:
# Check system logs
dmesg | tail -50
grep "error\|segfault\|killed" /var/log/syslog
# Check application logs
tail -100 /var/log/application.log
# Check for OOM killer
dmesg | grep -i "killed process"
# Check core dumps
ls -l /var/crash/
ls -l /tmp/core*
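On systemd distributions that run systemd-coredump, crashes are captured centrally and are easier to inspect than raw files under /var/crash. A sketch, assuming systemd-coredump is in use:
# List recent crashes
coredumpctl list
# Show details and a backtrace for the most recent dump of a program
coredumpctl info nginx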
Common causes:
# 1. Restart service
systemctl restart nginx
# 2. Check if started successfully
systemctl status nginx
curl http://localhost
# 3. If startup fails, check config
nginx -t # Test nginx config
sudo -u postgres postgres -D /var/lib/postgresql/data -C config_file   # parses postgresql.conf, fails if invalid
# 4. Enable auto-restart (systemd)
# Add to service file:
[Service]
Restart=always
RestartSec=10
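Rather than editing the packaged unit file directly, the usual way to apply the Restart= settings above is a drop-in override, which survives package upgrades. A sketch for nginx:
# Opens an editor for /etc/systemd/system/nginx.service.d/override.conf
systemctl edit nginx
# Paste the [Service] block above, save, then reload units and restart
systemctl daemon-reload
systemctl restart nginx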
**AWS (EC2/EBS)**
Instance Issues:
# Check instance health
aws ec2 describe-instance-status --instance-ids i-1234567890abcdef0
# Check system logs
aws ec2 get-console-output --instance-id i-1234567890abcdef0
# Check CloudWatch metrics
aws cloudwatch get-metric-statistics \
  --namespace AWS/EC2 \
  --metric-name CPUUtilization \
  --dimensions Name=InstanceId,Value=i-1234567890abcdef0 \
  --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ)" \
  --end-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  --period 300 \
  --statistics Average
EBS Volume Issues:
# Check volume status
aws ec2 describe-volumes --volume-ids vol-1234567890abcdef0
# Increase provisioned IOPS (gp3 baseline is 3000, max 16000)
aws ec2 modify-volume \
  --volume-id vol-1234567890abcdef0 \
  --iops 6000
# Check volume metrics
aws cloudwatch get-metric-statistics \
  --namespace AWS/EBS \
  --metric-name VolumeReadOps \
  --dimensions Name=VolumeId,Value=vol-1234567890abcdef0 \
  --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ)" --end-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  --period 300 --statistics Average
Network Issues:
# Check security groups
aws ec2 describe-security-groups --group-ids sg-1234567890abcdef0
# Check network ACLs
aws ec2 describe-network-acls --network-acl-ids acl-1234567890abcdef0
# Check route tables
aws ec2 describe-route-tables --route-table-ids rtb-1234567890abcdef0
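If a security group turns out to be the blocker, the missing rule can be added from the CLI. A sketch that opens HTTPS to the world (tighten the CIDR for real use; the group ID is an example):
aws ec2 authorize-security-group-ingress \
  --group-id sg-1234567890abcdef0 \
  --protocol tcp \
  --port 443 \
  --cidr 0.0.0.0/0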
**Azure**
VM Issues:
# Check VM status
az vm get-instance-view --name myVM --resource-group myRG
# Restart VM
az vm restart --name myVM --resource-group myRG
# Resize VM
az vm resize --name myVM --resource-group myRG --size Standard_D4s_v3
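When an Azure VM does not respond at all, the boot diagnostics log is the closest equivalent of the EC2 console output. A sketch, assuming boot diagnostics are (or can be) enabled on the VM:
# Fetch the serial/boot log
az vm boot-diagnostics get-boot-log --name myVM --resource-group myRG
# Enable boot diagnostics if they are not already on
az vm boot-diagnostics enable --name myVM --resource-group myRG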
Disk Issues:
# Check disk status
az disk show --name myDisk --resource-group myRG
# Expand disk
az disk update --name myDisk --resource-group myRG --size-gb 256
Server Health:
Uptime:
Response Time:
When diagnosing infrastructure issues, start with these tools:
- top, htop - CPU, memory
- df, du - Disk usage
- iostat - Disk I/O
- iftop, netstat - Network
- dmesg, journalctl - System logs
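When several of these checks are needed at once (for example before escalating), it can help to capture a single snapshot of the basics into a file. A minimal sketch using only the tools listed above; the output path is arbitrary:
#!/usr/bin/env bash
# Capture a one-shot infrastructure snapshot for later review
out="/tmp/infra-snapshot-$(date +%Y%m%d-%H%M%S).txt"
{
  echo "== uptime ==";             uptime
  echo "== cpu (top snapshot) =="; top -bn1 | head -20
  echo "== memory ==";             free -h
  echo "== disk usage ==";         df -h
  echo "== disk i/o ==";           iostat -x 1 2
  echo "== network ==";            ip addr show; ss -tlnp
  echo "== kernel messages ==";    dmesg | tail -50
} > "$out" 2>&1
echo "Snapshot written to $out"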