Implementing safety filters, content moderation, and guardrails for AI system inputs and outputs
Implements multi-layer safety filters to block malicious inputs and sanitize harmful outputs. Claude uses this when processing user requests to detect injection attempts, redact PII, and filter toxic content before responding.
/plugin marketplace add pluginagentmarketplace/custom-plugin-ai-red-teaming/plugin install pluginagentmarketplace-ai-red-teaming-plugin@pluginagentmarketplace/custom-plugin-ai-red-teamingThis skill inherits all available tools. When active, it can use any tool Claude has access to.
assets/guardrails-config.yamlreferences/GUARDRAIL-PATTERNS.mdscripts/test-guardrails.pyImplement multi-layer safety systems to filter malicious inputs and harmful outputs.
Skill: input-output-guardrails
Agent: 05-defense-strategy-developer
OWASP: LLM01 (Injection), LLM02 (Disclosure), LLM05 (Output), LLM07 (Leakage)
NIST: Manage
Use Case: Production safety filtering
User Input → [Input Guardrails] → [AI Model] → [Output Guardrails] → Response
↓ ↓
[Blocked/Modified] [Blocked/Modified]
↓ ↓
[Fallback Response] [Safe Alternative]
Category: prompt_injection
Latency: <10ms
Block Rate: 95%+
class InputGuardrails:
INJECTION_PATTERNS = [
r'ignore\s+(previous|prior|all)\s+(instructions?|guidelines?)',
r'you\s+are\s+(now|an?)\s+(unrestricted|evil)',
r'(developer|admin|debug)\s+mode',
r'bypass\s+(safety|security|filter)',
r'pretend\s+(you|to)\s+(are|be)',
r'what\s+(is|are)\s+your\s+(instructions?|prompt)',
]
def __init__(self, config):
self.patterns = [re.compile(p, re.I) for p in self.INJECTION_PATTERNS]
self.max_length = config.get('max_length', 4096)
self.pii_detector = PIIDetector()
def validate(self, user_input: str) -> tuple[bool, str]:
# Length check
if len(user_input) > self.max_length:
return False, "Input too long"
# Empty check
if not user_input.strip():
return False, "Empty input"
# Injection detection
for pattern in self.patterns:
if pattern.search(user_input):
return False, "Invalid request"
# PII handling
if self.pii_detector.contains_pii(user_input):
return True, self.pii_detector.redact(user_input)
return True, user_input
class PIIDetector:
PATTERNS = {
'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
'credit_card': r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b',
'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
'api_key': r'(sk|pk)[-_][a-zA-Z0-9]{20,}',
}
def contains_pii(self, text: str) -> bool:
for pattern in self.PATTERNS.values():
if re.search(pattern, text):
return True
return False
def redact(self, text: str) -> str:
for name, pattern in self.PATTERNS.items():
text = re.sub(pattern, f'[REDACTED_{name.upper()}]', text)
return text
Limits:
max_tokens_input: 4096
max_requests_per_minute: 60
max_concurrent: 5
cost_limit_per_hour: $10
Actions:
exceeded_tokens: truncate
exceeded_rate: queue (5s backoff)
exceeded_concurrent: reject
exceeded_cost: block
class OutputGuardrails:
def __init__(self, config):
self.toxicity_threshold = config.get('toxicity', 0.3)
self.toxicity_model = load_toxicity_classifier()
self.blocklist = self._load_blocklist()
def filter(self, response: str) -> tuple[str, dict]:
metadata = {'filtered': False, 'reasons': []}
# Toxicity check
toxicity = self.toxicity_model.predict(response)
if toxicity > self.toxicity_threshold:
metadata['filtered'] = True
metadata['reasons'].append('toxicity')
return self._safe_response(), metadata
# Blocklist check
for term in self.blocklist:
if term.lower() in response.lower():
metadata['filtered'] = True
metadata['reasons'].append('blocklist')
return self._safe_response(), metadata
# System prompt leak detection
if self._detects_system_leak(response):
metadata['filtered'] = True
metadata['reasons'].append('system_leak')
response = self._redact_system_content(response)
return response, metadata
def _detects_system_leak(self, response: str) -> bool:
leak_indicators = [
'you are a helpful',
'your instructions are',
'system prompt:',
]
return any(ind in response.lower() for ind in leak_indicators)
class OutputRedactor:
SENSITIVE_PATTERNS = {
'api_key': r'[a-zA-Z0-9_-]{20,}(?:key|token|secret)',
'password': r'password["\']?\s*[:=]\s*["\']?[^\s"\']+',
'connection_string': r'(mongodb|mysql|postgres)://[^\s]+',
'ip_address': r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
}
def redact(self, response: str) -> str:
for name, pattern in self.SENSITIVE_PATTERNS.items():
response = re.sub(pattern, '[REDACTED]', response, flags=re.I)
return response
Factuality Checks:
major_claims:
action: flag_for_verification
threshold: confidence < 0.8
citations:
action: verify_source_exists
block_if: source_not_found
uncertainty:
action: add_disclaimer
phrases: ["I'm not certain", "might be", "could be"]
# guardrails_config.yaml
input:
injection_detection: true
pii_redaction: true
max_length: 4096
rate_limit: 60/min
output:
toxicity_threshold: 0.3
blocklist_enabled: true
sensitive_redaction: true
system_leak_detection: true
fallback:
input_blocked: "I cannot process this request."
output_blocked: "I cannot provide this information."
logging:
log_blocked: true
log_filtered: true
include_reason: false # Privacy
┌──────────────────┬─────────┬────────┬──────────┐
│ Metric │ Target │ Actual │ Status │
├──────────────────┼─────────┼────────┼──────────┤
│ Injection Block │ >95% │ 97% │ ✓ PASS │
│ False Positive │ <2% │ 1.5% │ ✓ PASS │
│ Latency Impact │ <50ms │ 35ms │ ✓ PASS │
│ Toxicity Block │ >90% │ 92% │ ✓ PASS │
│ PII Redaction │ >99% │ 99.5% │ ✓ PASS │
└──────────────────┴─────────┴────────┴──────────┘
Issue: High false positive rate
Solution: Tune patterns, add allowlist, use context
Issue: Latency too high
Solution: Optimize regex, use compiled patterns, cache
Issue: Bypassed by encoding
Solution: Normalize unicode, decode before checking
| Component | Purpose |
|---|---|
| Agent 05 | Implements guardrails |
| /defend | Configuration recommendations |
| CI/CD | Automated testing |
| Monitoring | Alert on filter triggers |
Protect AI systems with comprehensive input/output guardrails.
This skill should be used when the user asks to "create a slash command", "add a command", "write a custom command", "define command arguments", "use command frontmatter", "organize commands", "create command with file references", "interactive command", "use AskUserQuestion in command", or needs guidance on slash command structure, YAML frontmatter fields, dynamic arguments, bash execution in commands, user interaction patterns, or command development best practices for Claude Code.
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.
This skill should be used when the user asks to "create a hook", "add a PreToolUse/PostToolUse/Stop hook", "validate tool use", "implement prompt-based hooks", "use ${CLAUDE_PLUGIN_ROOT}", "set up event-driven automation", "block dangerous commands", or mentions hook events (PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart, SessionEnd, UserPromptSubmit, PreCompact, Notification). Provides comprehensive guidance for creating and implementing Claude Code plugin hooks with focus on advanced prompt-based hooks API.