Install
1
Install the plugin:
npx claudepluginhub latestaiagents/agent-skills --plugin agent-plugin

Want just this skill?
Then install: npx claudepluginhub u/[userId]/[slug]
Description
Use this skill when implementing error handling for AI agents. Activate when the user needs agents to handle failures gracefully, implement retry strategies, design fault-tolerant agent systems, or build agents that can recover from errors without human intervention.
Tool Access
This skill uses the workspace's default tool permissions.
Skill Content
Agent Error Recovery
Design fault-tolerant agent systems that recover gracefully from failures.
When to Use
- Building production-grade agent systems
- Agents need to handle API failures
- Implementing autonomous error recovery
- Designing resilient multi-agent workflows
- Setting up monitoring and alerting
Error Classification
enum ErrorCategory {
// Transient - retry likely to succeed
RATE_LIMIT = 'rate_limit',
TIMEOUT = 'timeout',
NETWORK = 'network',
SERVICE_UNAVAILABLE = 'service_unavailable',
// Recoverable - different approach may work
INVALID_INPUT = 'invalid_input',
CONTEXT_OVERFLOW = 'context_overflow',
TOOL_FAILURE = 'tool_failure',
// Terminal - cannot proceed
AUTHENTICATION = 'authentication',
AUTHORIZATION = 'authorization',
NOT_FOUND = 'not_found',
VALIDATION = 'validation',
// Unknown
UNKNOWN = 'unknown'
}
interface AgentError {
category: ErrorCategory;
code: string;
message: string;
retryable: boolean;
context: Record<string, unknown>;
timestamp: Date;
stackTrace?: string;
}
function classifyError(error: Error): AgentError {
// Rate limits
if (error.message.includes('429') || error.message.includes('rate limit')) {
return {
category: ErrorCategory.RATE_LIMIT,
code: 'RATE_LIMITED',
message: error.message,
retryable: true,
context: { waitTime: extractWaitTime(error) },
timestamp: new Date()
};
}
// Timeouts
if (error.message.includes('timeout') || error.message.includes('ETIMEDOUT')) {
return {
category: ErrorCategory.TIMEOUT,
code: 'TIMEOUT',
message: error.message,
retryable: true,
context: {},
timestamp: new Date()
};
}
// Context overflow
if (error.message.includes('context length') || error.message.includes('too long')) {
return {
category: ErrorCategory.CONTEXT_OVERFLOW,
code: 'CONTEXT_OVERFLOW',
message: error.message,
retryable: true, // Can retry with truncated context
context: {},
timestamp: new Date()
};
}
// Default
return {
category: ErrorCategory.UNKNOWN,
code: 'UNKNOWN',
message: error.message,
retryable: false,
context: {},
timestamp: new Date(),
stackTrace: error.stack
};
}
Recovery Strategies
Strategy 1: Retry with Backoff
interface RetryConfig {
maxAttempts: number;
initialDelayMs: number;
maxDelayMs: number;
backoffMultiplier: number;
jitterMs: number;
}
async function retryWithBackoff<T>(
operation: () => Promise<T>,
config: RetryConfig
): Promise<T> {
let lastError: Error;
let delay = config.initialDelayMs;
for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
try {
return await operation();
} catch (error) {
lastError = error as Error;
const classified = classifyError(lastError);
// Don't retry non-retryable errors
if (!classified.retryable) {
throw lastError;
}
// Last attempt - throw
if (attempt === config.maxAttempts) {
throw lastError;
}
// Calculate delay with jitter
const jitter = Math.random() * config.jitterMs;
const waitTime = Math.min(delay + jitter, config.maxDelayMs);
console.log(`Attempt ${attempt} failed, retrying in ${waitTime}ms...`);
await sleep(waitTime);
// Increase delay for next attempt
delay *= config.backoffMultiplier;
}
}
throw lastError!;
}
Strategy 2: Circuit Breaker
enum CircuitState {
CLOSED = 'closed', // Normal operation
OPEN = 'open', // Failing, reject requests
HALF_OPEN = 'half_open' // Testing if recovered
}
class CircuitBreaker {
private state = CircuitState.CLOSED;
private failures = 0;
private lastFailure?: Date;
private successCount = 0;
constructor(
private config: {
failureThreshold: number;
resetTimeoutMs: number;
successThreshold: number;
}
) {}
async execute<T>(operation: () => Promise<T>): Promise<T> {
// Check if circuit should transition
this.checkState();
if (this.state === CircuitState.OPEN) {
throw new Error('Circuit breaker is OPEN');
}
try {
const result = await operation();
this.onSuccess();
return result;
} catch (error) {
this.onFailure();
throw error;
}
}
private checkState(): void {
if (this.state === CircuitState.OPEN) {
const elapsed = Date.now() - this.lastFailure!.getTime();
if (elapsed >= this.config.resetTimeoutMs) {
this.state = CircuitState.HALF_OPEN;
this.successCount = 0;
}
}
}
private onSuccess(): void {
if (this.state === CircuitState.HALF_OPEN) {
this.successCount++;
if (this.successCount >= this.config.successThreshold) {
this.state = CircuitState.CLOSED;
this.failures = 0;
}
} else {
this.failures = 0;
}
}
private onFailure(): void {
this.failures++;
this.lastFailure = new Date();
if (this.failures >= this.config.failureThreshold) {
this.state = CircuitState.OPEN;
}
}
}
Strategy 3: Fallback Chain
interface FallbackOption<T> {
name: string;
execute: () => Promise<T>;
isApplicable: (error: AgentError) => boolean;
}
async function executeWithFallbacks<T>(
primary: () => Promise<T>,
fallbacks: FallbackOption<T>[]
): Promise<T> {
try {
return await primary();
} catch (error) {
const classified = classifyError(error as Error);
for (const fallback of fallbacks) {
if (fallback.isApplicable(classified)) {
console.log(`Primary failed, trying fallback: ${fallback.name}`);
try {
return await fallback.execute();
} catch (fallbackError) {
console.log(`Fallback ${fallback.name} also failed`);
continue;
}
}
}
// All fallbacks failed
throw error;
}
}
// Example usage
const result = await executeWithFallbacks(
() => callPrimaryAPI(),
[
{
name: 'backup_api',
execute: () => callBackupAPI(),
isApplicable: (e) => e.category === ErrorCategory.SERVICE_UNAVAILABLE
},
{
name: 'cached_response',
execute: () => getCachedResponse(),
isApplicable: (e) => e.category === ErrorCategory.TIMEOUT
},
{
name: 'simplified_request',
execute: () => callWithReducedParams(),
isApplicable: (e) => e.category === ErrorCategory.CONTEXT_OVERFLOW
}
]
);
Strategy 4: Self-Healing Agent
class SelfHealingAgent {
async execute(task: Task): Promise<Result> {
let attempt = 0;
const maxAttempts = 3;
while (attempt < maxAttempts) {
attempt++;
try {
return await this.runTask(task);
} catch (error) {
const classified = classifyError(error as Error);
// Can we heal?
const healingAction = this.determineHealingAction(classified);
if (!healingAction) {
throw error;
}
console.log(`Attempting self-healing: ${healingAction.description}`);
// Execute healing
await healingAction.execute();
// Modify task if needed
task = healingAction.modifyTask?.(task) || task;
}
}
throw new Error('Max healing attempts exceeded');
}
private determineHealingAction(error: AgentError): HealingAction | null {
switch (error.category) {
case ErrorCategory.CONTEXT_OVERFLOW:
return {
description: 'Truncating context to fit limits',
execute: async () => {},
modifyTask: (task) => ({
...task,
context: this.truncateContext(task.context)
})
};
case ErrorCategory.TOOL_FAILURE:
return {
description: 'Switching to alternative tool',
execute: async () => {
this.toolRouter.excludeTool(error.context.toolName as string);
}
};
case ErrorCategory.RATE_LIMIT:
return {
description: `Waiting ${error.context.waitTime}ms for rate limit`,
execute: async () => {
await sleep(error.context.waitTime as number);
}
};
default:
return null;
}
}
}
Error Recovery Workflow
┌─────────────────────────────────────────────────────────────┐
│ Error Occurs │
└─────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ Classify Error │
└─────────────────────────────────────────────────────────────┘
│
┌─────────────────┼─────────────────┐
▼ ▼ ▼
┌─────────┐       ┌───────────┐       ┌─────────┐
│Transient│       │Recoverable│       │Terminal │
└────┬────┘       └─────┬─────┘       └────┬────┘
     │                  │                  │
     ▼                  ▼                  ▼
┌─────────┐       ┌───────────┐       ┌─────────┐
│  Retry  │       │    Try    │       │  Log &  │
│w/Backoff│       │ Fallback  │       │  Alert  │
└────┬────┘       └─────┬─────┘       └────┬────┘
│ │ │
▼ ▼ ▼
┌─────────────────────────────────────────┐
│ Success or Escalate │
└─────────────────────────────────────────┘
Monitoring and Alerting
interface ErrorMetrics {
totalErrors: number;
errorsByCategory: Map<ErrorCategory, number>;
errorRate: number; // errors per minute
recoveryRate: number; // successful recoveries
mttr: number; // mean time to recover (ms)
}
class ErrorMonitor {
private errors: AgentError[] = [];
private recoveries: { error: AgentError; recoveredAt: Date }[] = [];
recordError(error: AgentError): void {
this.errors.push(error);
this.checkAlerts();
}
recordRecovery(error: AgentError): void {
this.recoveries.push({ error, recoveredAt: new Date() });
}
private checkAlerts(): void {
const recentErrors = this.getRecentErrors(60000); // Last minute
// High error rate alert
if (recentErrors.length > 10) {
this.sendAlert({
severity: 'high',
message: `High error rate: ${recentErrors.length} errors in last minute`,
errors: recentErrors
});
}
// Repeated same error alert
const errorCounts = new Map<string, number>();
for (const e of recentErrors) {
const key = `${e.category}:${e.code}`;
errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
}
for (const [key, count] of errorCounts) {
if (count >= 5) {
this.sendAlert({
severity: 'medium',
message: `Repeated error: ${key} occurred ${count} times`,
errors: recentErrors.filter(e => `${e.category}:${e.code}` === key)
});
}
}
}
getMetrics(): ErrorMetrics {
const window = 5 * 60 * 1000; // 5 minutes
const recent = this.getRecentErrors(window);
const byCategory = new Map<ErrorCategory, number>();
for (const e of recent) {
byCategory.set(e.category, (byCategory.get(e.category) || 0) + 1);
}
const recentRecoveries = this.recoveries.filter(
r => Date.now() - r.recoveredAt.getTime() < window
);
const recoveryTimes = recentRecoveries.map(
r => r.recoveredAt.getTime() - r.error.timestamp.getTime()
);
return {
totalErrors: recent.length,
errorsByCategory: byCategory,
errorRate: recent.length / (window / 60000),
recoveryRate: recentRecoveries.length / Math.max(recent.length, 1),
mttr: recoveryTimes.length > 0
? recoveryTimes.reduce((a, b) => a + b, 0) / recoveryTimes.length
: 0
};
}
}
Best Practices
- Classify all errors - Know what you're dealing with
- Don't retry everything - Some errors won't recover
- Use exponential backoff - Avoid hammering failing services
- Set circuit breakers - Protect downstream systems
- Log everything - Debugging is hard without logs
- Have fallbacks - Always have a Plan B
- Alert on patterns - Single errors may be noise, patterns matter
- Test failure scenarios - Chaos engineering
Stats
Stars: 2
Forks: 0
Last Commit: Feb 5, 2026
Similar Skills
brainstorming
7 files
You MUST use this before any creative work - creating features, building components, adding functionality, or modifying behavior. Explores user intent, requirements and design before implementation.
superpowers
102.8k