Instrument error handling, retries, fallbacks, and failure patterns
Automatically instruments error handling, retries, and fallbacks to track failure patterns and recovery rates. Use this when wrapping API calls or LLM invocations to capture transient vs. permanent errors, retry attempts, and fallback behavior for observability.
```
/plugin marketplace add nexus-labs-automation/agent-observability
/plugin install nexus-labs-automation-agent-observability@nexus-labs-automation/agent-observability
```

This skill inherits all available tools. When active, it can use any tool Claude has access to.
Instrument error handling to understand failure patterns and recovery behavior.
Error observability answers: which errors are transient vs. permanent, how often retries succeed, when fallbacks and circuit breakers trigger, and how much time and cost failures waste.
```python
TRANSIENT_ERRORS = [
    "RateLimitError",
    "TimeoutError",
    "ServiceUnavailable",
    "ConnectionError",
]

PERMANENT_ERRORS = [
    "InvalidRequestError",
    "AuthenticationError",
    "ContentPolicyViolation",
    "ContextLengthExceeded",
]

def classify_error(error: Exception) -> str:
    error_type = type(error).__name__
    if error_type in TRANSIENT_ERRORS:
        return "transient"
    elif error_type in PERMANENT_ERRORS:
        return "permanent"
    return "unknown"
```
```python
# Error identification (P0)
span.set_attribute("error.type", "RateLimitError")
span.set_attribute("error.message", "Rate limit exceeded")
span.set_attribute("error.category", "transient")
span.set_attribute("error.source", "llm_provider")

# Provider context (P1)
span.set_attribute("error.provider", "anthropic")
span.set_attribute("error.model", "claude-3-opus")
span.set_attribute("error.status_code", 429)
span.set_attribute("error.request_id", "req_abc123")

# Timing context (P1)
span.set_attribute("error.retry_after_ms", 60000)
span.set_attribute("error.occurred_at_step", 3)
span.set_attribute("error.time_into_request_ms", 2500)

# Impact (P2)
span.set_attribute("error.tokens_wasted", 1500)  # Tokens sent before failure
span.set_attribute("error.cost_wasted_usd", 0.015)
```
```python
# Retry tracking (P0)
span.set_attribute("retry.attempt", 2)
span.set_attribute("retry.max_attempts", 3)
span.set_attribute("retry.strategy", "exponential_backoff")

# Timing (P1)
span.set_attribute("retry.delay_ms", 2000)
span.set_attribute("retry.total_wait_ms", 3500)
span.set_attribute("retry.jitter_ms", 150)

# Outcome (P0)
span.set_attribute("retry.success", True)
span.set_attribute("retry.final_attempt", 2)
span.set_attribute("retry.exhausted", False)
```
```python
from functools import wraps
import time

from langfuse.decorators import observe
from opentelemetry.trace import get_current_span  # used to attach retry.* attributes

def with_retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
):
    def decorator(func):
        @wraps(func)
        @observe(name=f"{func.__name__}.with_retry")
        def wrapper(*args, **kwargs):
            span = get_current_span()
            span.set_attribute("retry.max_attempts", max_attempts)
            span.set_attribute("retry.strategy", "exponential_backoff")

            last_error = None
            total_wait = 0
            for attempt in range(1, max_attempts + 1):
                try:
                    span.set_attribute("retry.attempt", attempt)
                    result = func(*args, **kwargs)
                    span.set_attribute("retry.success", True)
                    span.set_attribute("retry.final_attempt", attempt)
                    return result
                except Exception as e:
                    last_error = e
                    span.set_attribute("error.type", type(e).__name__)
                    span.set_attribute("error.category", classify_error(e))

                    # Don't retry permanent errors; surface them immediately.
                    if classify_error(e) == "permanent":
                        span.set_attribute("retry.exhausted", False)
                        span.set_attribute("retry.abort_reason", "permanent_error")
                        raise

                    if attempt < max_attempts:
                        delay = min(
                            base_delay * (exponential_base ** (attempt - 1)),
                            max_delay,
                        )
                        total_wait += delay
                        span.add_event("retry.waiting", {"delay_ms": delay * 1000})
                        time.sleep(delay)

            span.set_attribute("retry.success", False)
            span.set_attribute("retry.exhausted", True)
            span.set_attribute("retry.total_wait_ms", total_wait * 1000)
            raise last_error
        return wrapper
    return decorator

@with_retry(max_attempts=3)
def call_llm(messages):
    return client.messages.create(messages=messages)
```
```python
# Fallback span attributes
span.set_attribute("fallback.triggered", True)
span.set_attribute("fallback.reason", "primary_model_unavailable")
span.set_attribute("fallback.from_model", "claude-3-opus")
span.set_attribute("fallback.to_model", "claude-3-sonnet")
span.set_attribute("fallback.quality_impact", "reduced")

# Fallback chain
span.set_attribute("fallback.chain", ["opus", "sonnet", "haiku"])
span.set_attribute("fallback.chain_position", 2)
```
```python
# Rate limit specific attributes
span.set_attribute("rate_limit.type", "tokens_per_minute")
span.set_attribute("rate_limit.limit", 100000)
span.set_attribute("rate_limit.remaining", 0)
span.set_attribute("rate_limit.reset_at", "2024-01-15T10:01:00Z")
span.set_attribute("rate_limit.retry_after_ms", 45000)

# Proactive rate limiting
span.set_attribute("rate_limit.preemptive_wait", True)
span.set_attribute("rate_limit.tokens_queued", 5000)
```
```python
from enum import Enum

class CircuitState(Enum):
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing, reject requests
    HALF_OPEN = "half_open"  # Testing recovery

# Circuit breaker attributes
span.set_attribute("circuit.state", "open")
span.set_attribute("circuit.failure_count", 5)
span.set_attribute("circuit.failure_threshold", 5)
span.set_attribute("circuit.last_failure_at", timestamp)
span.set_attribute("circuit.opens_at", timestamp)
span.set_attribute("circuit.half_open_attempts", 0)
```
Track error patterns:
```python
# Per-session error summary
span.set_attribute("session.total_errors", 3)
span.set_attribute("session.transient_errors", 2)
span.set_attribute("session.permanent_errors", 1)
span.set_attribute("session.retry_success_rate", 0.67)

# Per-provider health
span.set_attribute("provider.health", "degraded")
span.set_attribute("provider.error_rate_1h", 0.05)
span.set_attribute("provider.avg_latency_1h_ms", 2500)
```
```python
from langchain.chat_models import ChatAnthropic
from langfuse.callback import CallbackHandler

llm = ChatAnthropic(
    model="claude-3-opus",
    max_retries=3,
    request_timeout=30,
)

# Callbacks capture retry behavior
handler = CallbackHandler()
response = llm.invoke(messages, config={"callbacks": [handler]})
```
```python
from tenacity import retry, stop_after_attempt, wait_exponential
from langfuse.decorators import observe

@observe(name="llm.call")
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=60),
)
def call_with_tenacity(messages):
    return client.messages.create(messages=messages)
```
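Tenacity can also report each retry back to the active span via its `before_sleep` hook; a sketch, assuming an OpenTelemetry current span and tenacity's `RetryCallState` fields (`attempt_number`, `outcome`, `next_action`):

```python
# Sketch: record retry.* attributes from tenacity's retry state before each wait.
from opentelemetry.trace import get_current_span
from tenacity import retry, stop_after_attempt, wait_exponential

def record_retry_to_span(retry_state):
    span = get_current_span()
    span.set_attribute("retry.attempt", retry_state.attempt_number)
    if retry_state.outcome is not None and retry_state.outcome.failed:
        error = retry_state.outcome.exception()
        span.set_attribute("error.type", type(error).__name__)
        span.set_attribute("error.category", classify_error(error))
    if retry_state.next_action is not None:
        span.set_attribute("retry.delay_ms", retry_state.next_action.sleep * 1000)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=60),
    before_sleep=record_retry_to_span,
)
def call_with_tenacity_traced(messages):
    return client.messages.create(messages=messages)
```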
Related skills: llm-call-tracing (LLM error context), tool-call-tracking (tool error handling).