Help us improve
Share bugs, ideas, or general feedback.
From site-reliability-engineering
Provides SRE templates for SLOs, error budgets with Prometheus, and JavaScript patterns like circuit breakers and exponential backoff retries for reliable distributed systems.
npx claudepluginhub thebushidocollective/han --plugin do-site-reliability-engineering
How this skill is triggered — by the user, by Claude, or both
Slash command
/site-reliability-engineering:sre-reliability
The summary Claude sees in its skill listing — used to decide when to auto-load this skill
Building reliable and scalable distributed systems.
Defines SLOs/SLIs, error budgets, incident response procedures, capacity models, monitoring configurations, and automation scripts for production systems. Use for SRE tasks like chaos engineering, toil reduction, and reliability at scale.
Defines and implements SLIs, SLOs, and error budgets for service reliability using PromQL queries for availability/latency, YAML configs, and downtime calculations. Useful for reliability targets and alerts.
Designs Service Level Objectives (SLOs) with SLIs, targets, alerting thresholds, and error budgets following Google SRE best practices. Use for defining reliability targets, calculating error budgets, or establishing service indicators.
Share bugs, ideas, or general feedback.
Building reliable and scalable distributed systems.
SLI: Availability = successful requests / total requests
SLO: 99.9% availability (measured over 30 days)
Error Budget: 0.1% ≈ 43 minutes of downtime per 30-day window (0.001 × 43,200 minutes = 43.2 minutes)
# API Service SLO
## Availability SLO
**Target**: 99.9% of requests succeed (measured over 30 days)
**SLI Definition**:
- Success: HTTP 200-399 responses
- Failure: HTTP 500-599 responses, timeouts
- Excluded: HTTP 400-499 (client errors)
**Measurement**:
```prometheus
sum(rate(http_requests_total{status=~"[23].."}[30d]))
/
sum(rate(http_requests_total{status!~"4.."}[30d]))
```
Error Budget: 0.1% = ~43 minutes/month
Consequences:
## Error Budgets
### Tracking
```prometheus
# Error budget remaining
error_budget_remaining = 1 - (
(1 - current_sli) / (1 - slo_target)
)
# Example: 99.9% SLO, currently at 99.95%
# Error budget remaining = 1 - ((1 - 0.9995) / (1 - 0.999))
# = 1 - (0.0005 / 0.001) = 0.5 (50% remaining)
# How fast are we consuming error budget?
error_budget_burn_rate =
(1 - current_sli_1h) / (1 - slo_target)
# Alert if burning budget 10x faster than sustainable
- alert: FastErrorBudgetBurn
expr: error_budget_burn_rate > 10
for: 1h
```
Error Budget > 75%: Ship aggressively
Error Budget 25-75%: Normal velocity
Error Budget < 25%: Slow down, increase testing
Error Budget = 0%: Feature freeze, reliability only
/**
 * Circuit breaker that stops invoking a failing operation until a cool-down
 * period has elapsed.
 *
 * States (stored as strings in `state`):
 * - 'CLOSED'    — calls pass through; failures since the last success are counted.
 * - 'OPEN'      — calls are rejected immediately without invoking the operation.
 * - 'HALF_OPEN' — entered by the first call made more than `timeout` ms after
 *                 the breaker opened; that call is let through as a probe.
 */
class CircuitBreaker {
  /**
   * @param {object} [options] - May be omitted entirely; all fields have defaults.
   * @param {number} [options.threshold=5] - Failure count that trips the breaker.
   * @param {number} [options.timeout=60000] - Milliseconds to stay OPEN before a probe is allowed.
   */
  constructor({ threshold = 5, timeout = 60000 } = {}) {
    this.state = 'CLOSED';
    this.failures = 0;
    this.threshold = threshold;
    this.timeout = timeout;
    // Timestamp (ms since epoch) of the most recent trip; null until the breaker first opens.
    this.openedAt = null;
  }

  /**
   * Runs `fn` through the breaker.
   *
   * @param {() => any} fn - Operation to protect; its result is awaited.
   * @returns {Promise<any>} Whatever `fn` resolves to.
   * @throws {Error} 'Circuit breaker is OPEN' while the cool-down has not elapsed;
   *   otherwise rethrows whatever `fn` threw.
   */
  async call(fn) {
    if (this.state === 'OPEN') {
      const coolDownElapsed = Date.now() - this.openedAt > this.timeout;
      if (!coolDownElapsed) {
        throw new Error('Circuit breaker is OPEN');
      }
      this.state = 'HALF_OPEN';
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /** Records a success: clears the failure count and closes the breaker. */
  onSuccess() {
    this.failures = 0;
    this.state = 'CLOSED';
  }

  /**
   * Records a failure and opens the breaker when the threshold is reached.
   * A failed probe in HALF_OPEN always re-opens the breaker and restarts the
   * cool-down, independent of the current failure count.
   */
  onFailure() {
    this.failures++;
    if (this.state === 'HALF_OPEN' || this.failures >= this.threshold) {
      this.state = 'OPEN';
      this.openedAt = Date.now();
    }
  }
}
/**
 * Calls `fn` until it succeeds or the attempt budget is exhausted, waiting
 * with capped exponential backoff plus random jitter between attempts.
 *
 * The wait before retry k (0-based) is min(1000 * 2^k, 10000) ms plus a random
 * 0–1000 ms jitter.
 *
 * NOTE(review): relies on a `sleep` helper being in scope; it is assumed to
 * take milliseconds and return a promise — confirm against its definition.
 *
 * @param {() => any} fn - Operation to attempt; its result is awaited.
 * @param {number} [maxRetries=3] - Total number of attempts. Values below 1
 *   (or NaN) are treated as 1 so `fn` is always tried at least once;
 *   fractional values are rounded up.
 * @returns {Promise<any>} The first successful result of `fn`.
 * @throws Rethrows the error from the final failed attempt.
 */
async function retryWithBackoff(fn, maxRetries = 3) {
  // Normalise the budget: without this, 0/negative/NaN would skip the loop and
  // resolve undefined, and a fractional budget would never match the
  // "last attempt" check below and silently swallow the final error.
  const requested = Math.ceil(maxRetries);
  const attempts = requested >= 1 ? requested : 1;

  for (let attempt = 0; attempt < attempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      const isLastAttempt = attempt === attempts - 1;
      if (isLastAttempt) throw error;

      const delay = Math.min(1000 * Math.pow(2, attempt), 10000);
      // Jitter spreads out retries from callers that failed at the same time.
      const jitter = Math.random() * 1000;
      await sleep(delay + jitter);
    }
  }
}
/**
 * Token-bucket rate limiter. The bucket starts full, refills continuously at
 * `refillRate` tokens per second based on `Date.now()`, and never holds more
 * than `capacity` tokens.
 */
class TokenBucket {
  /**
   * @param {object} options
   * @param {number} options.capacity - Maximum (and initial) number of tokens.
   * @param {number} options.refillRate - Tokens added per second of elapsed time.
   */
  constructor({ capacity, refillRate }) {
    this.capacity = capacity;
    this.tokens = capacity;
    this.refillRate = refillRate;
    this.lastRefill = Date.now();
  }

  /**
   * Attempts to take `tokens` from the bucket after topping it up.
   *
   * @param {number} [tokens=1] - Number of tokens to consume; must not be negative.
   * @returns {boolean} true if the tokens were deducted, false if the bucket
   *   did not hold enough (in which case nothing is deducted).
   * @throws {RangeError} If `tokens` is negative — subtracting a negative
   *   amount would otherwise add tokens and could exceed `capacity`.
   */
  tryConsume(tokens = 1) {
    if (tokens < 0) {
      throw new RangeError('tokens must not be negative');
    }

    this.refill();

    if (this.tokens >= tokens) {
      this.tokens -= tokens;
      return true;
    }

    return false;
  }

  /** Adds the tokens earned since the last refill, capped at `capacity`. */
  refill() {
    const now = Date.now();
    // Date.now() follows the wall clock and can jump backwards; clamp so a
    // negative interval never removes tokens.
    const elapsed = Math.max(0, (now - this.lastRefill) / 1000);
    const tokensToAdd = elapsed * this.refillRate;

    this.tokens = Math.min(
      this.capacity,
      this.tokens + tokensToAdd
    );
    this.lastRefill = now;
  }
}
/**
 * Bulkhead that caps how many operations run at once. Callers beyond the
 * limit park on a wake-up callback stored in `queue` and re-check the limit
 * each time they are woken.
 */
class Bulkhead {
  /**
   * @param {object} options
   * @param {number} options.maxConcurrent - Maximum number of operations in flight.
   */
  constructor(options) {
    const { maxConcurrent } = options;
    Object.assign(this, { maxConcurrent, current: 0, queue: [] });
  }

  /**
   * Runs `fn` once a slot is available and releases the slot afterwards,
   * whether `fn` succeeds or throws.
   *
   * @param {() => any} fn - Operation to run; its result is awaited.
   * @returns {Promise<any>} Whatever `fn` resolves to.
   * @throws Rethrows whatever `fn` throws.
   */
  async execute(fn) {
    // Loop rather than wait once: another caller may have taken the freed
    // slot before this one resumes.
    while (this.current >= this.maxConcurrent) {
      await new Promise((wake) => {
        this.queue.push(wake);
      });
    }

    this.current += 1;
    try {
      return await fn();
    } finally {
      this.current -= 1;
      // Hand the freed slot to the longest-waiting caller, if any.
      if (this.queue.length > 0) {
        this.queue.shift()();
      }
    }
  }
}
/**
 * Returns recommendations for a user, degrading gracefully:
 *   1. personalized recommendations (500 ms timeout option, to fail fast),
 *   2. the cached 'popular_items' entry,
 *   3. DEFAULT_RECOMMENDATIONS.
 *
 * Each failed tier is logged as a warning before moving to the next, so a
 * broken fallback path is visible instead of being silently masked.
 *
 * Relies on `recommendationService`, `cache`, `logger` and
 * `DEFAULT_RECOMMENDATIONS` being in scope.
 *
 * @param {*} userId - Identifier passed to the recommendation service and included in log metadata.
 * @returns {Promise<*>} Result of the first tier that produced a value.
 */
async function getRecommendations(userId) {
  try {
    // Try personalized recommendations
    return await recommendationService.getPersonalized(userId, {
      timeout: 500, // Fail fast
    });
  } catch (error) {
    logger.warn('Personalized recommendations failed, falling back', {
      userId,
      error: error.message,
    });
  }

  try {
    // Fall back to popular items
    const popular = await cache.get('popular_items');
    // A null/undefined result (presumably a cache miss — depends on the cache
    // client) is not a usable answer; fall through to the defaults instead.
    if (popular !== null && popular !== undefined) {
      return popular;
    }
  } catch (fallbackError) {
    logger.warn('Popular items fallback failed, using defaults', {
      userId,
      error: fallbackError.message,
    });
  }

  // Final fallback
  return DEFAULT_RECOMMENDATIONS;
}
# Current utilization
current_utilization =
sum(rate(http_requests_total[5m]))
/ capacity_requests_per_second
# Alert when approaching capacity
- alert: HighUtilization
expr: current_utilization > 0.80
for: 10m
Current QPS: 1,000
Growth rate: 20% per month
Capacity per instance: 100 QPS
Current instances: 12
In 6 months:
Projected QPS: 1,000 * (1.20)^6 = 2,986
Instances needed: 2,986 / 100 = 30
// k6 load test
import http from 'k6/http';
import { check, sleep } from 'k6';
/**
 * k6 run configuration: a staged load profile plus pass/fail thresholds.
 *
 * Each [duration, target] pair below becomes one `{ duration, target }` stage,
 * in order.
 */
export const options = {
  stages: [
    ['2m', 100], // Ramp up
    ['5m', 100], // Steady state
    ['2m', 200], // Spike
    ['5m', 200], // Higher steady
    ['2m', 0], // Ramp down
  ].map(([duration, target]) => ({ duration, target })),

  thresholds: {
    // 95% under 500ms
    http_req_duration: ['p(95)<500'],
    // Less than 1% errors
    http_req_failed: ['rate<0.01'],
  },
};
/**
 * k6 entry point: issues one GET against the endpoint, records two checks on
 * the response (status code and request duration), then pauses before
 * returning.
 */
export default function () {
  const response = http.get('https://api.example.com/endpoint');

  const expectations = {
    'status is 200': (r) => r.status === 200,
    'response time < 500ms': (r) => r.timings.duration < 500,
  };
  check(response, expectations);

  // Pause between requests; per the k6 API the argument is in seconds.
  sleep(1);
}
/**
 * Wraps `fn` so that a random fraction of calls is delayed before `fn` runs
 * (chaos-testing helper for latency injection).
 *
 * NOTE(review): relies on a `sleep` helper being in scope; it is assumed to
 * take milliseconds and return a promise — confirm against its definition.
 *
 * @param {(...args: any[]) => any} fn - Function to wrap.
 * @param {object} [options] - May be omitted entirely; all fields have defaults.
 * @param {number} [options.probability=0.1] - Chance (0–1) that a call is delayed.
 * @param {number} [options.delayMs=1000] - Value passed to `sleep` for delayed calls.
 * @returns {(...args: any[]) => Promise<any>} Async wrapper that forwards its
 *   arguments to `fn` and resolves to `fn`'s result.
 */
function withLatencyInjection(fn, { probability = 0.1, delayMs = 1000 } = {}) {
  return async (...args) => {
    const shouldDelay = Math.random() < probability;
    if (shouldDelay) {
      await sleep(delayMs);
    }
    return fn(...args);
  };
}
/**
 * Wraps `fn` so that a random fraction of calls fails before `fn` is invoked
 * (chaos-testing helper for failure injection).
 *
 * @param {(...args: any[]) => any} fn - Function to wrap.
 * @param {object} [options] - May be omitted entirely; all fields have defaults.
 * @param {number} [options.probability=0.05] - Chance (0–1) that a call fails.
 * @returns {(...args: any[]) => Promise<any>} Async wrapper that either rejects
 *   with Error('Injected failure') or forwards its arguments to `fn` and
 *   resolves to `fn`'s result.
 */
function withFailureInjection(fn, { probability = 0.05 } = {}) {
  return async (...args) => {
    const shouldFail = Math.random() < probability;
    if (shouldFail) {
      throw new Error('Injected failure');
    }
    return fn(...args);
  };
}