LLM evaluation frameworks, benchmarks, and quality metrics for production systems.
Evaluates LLM outputs using RAGAS, standard benchmarks such as MMLU and HumanEval, and custom quality metrics.
```
/plugin marketplace add pluginagentmarketplace/custom-plugin-ai-engineer
/plugin install pluginagentmarketplace-ai-engineer-plugin@pluginagentmarketplace/custom-plugin-ai-engineer
```

This skill inherits all available tools. When active, it can use any tool Claude has access to.
- assets/evaluation_config.yaml
- references/METRICS_GUIDE.md
- scripts/llm_evaluator.py

Measure and improve LLM quality systematically.
```python
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from datasets import Dataset

# Prepare evaluation data
eval_data = {
    "question": ["What is machine learning?"],
    "answer": ["ML is a subset of AI that learns from data."],
    "contexts": [["Machine learning is a field of AI..."]],
    "ground_truth": ["Machine learning is AI that learns patterns."],
}
dataset = Dataset.from_dict(eval_data)

# Run evaluation
results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    ],
)
print(results)
```
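RAGAS metrics are themselves LLM-judged, so scores depend on the judge model. A minimal sketch of pinning an explicit judge LLM and embedding model (the specific model names are assumptions, not requirements):

```python
# Sketch: pass an explicit judge LLM and embeddings to RAGAS (model names are illustrative)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

results = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    embeddings=OpenAIEmbeddings(),
)
```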
```python
from langchain.evaluation import load_evaluator

# Criteria-based evaluation
evaluator = load_evaluator("criteria", criteria="helpfulness")
result = evaluator.evaluate_strings(
    prediction="Paris is the capital of France.",
    input="What is the capital of France?",
)
print(f"Score: {result['score']}, Reasoning: {result['reasoning']}")
```
```python
from evaluate import load
import numpy as np

class TextMetrics:
    def __init__(self):
        self.bleu = load("bleu")
        self.rouge = load("rouge")
        self.bertscore = load("bertscore")

    def evaluate(self, predictions: list, references: list) -> dict:
        metrics = {}

        # BLEU - precision-based n-gram overlap
        bleu_result = self.bleu.compute(
            predictions=predictions,
            references=[[r] for r in references],
        )
        metrics['bleu'] = bleu_result['bleu']

        # ROUGE - recall-oriented overlap
        rouge_result = self.rouge.compute(
            predictions=predictions,
            references=references,
        )
        metrics['rouge1'] = rouge_result['rouge1']
        metrics['rougeL'] = rouge_result['rougeL']

        # BERTScore - semantic similarity
        bert_result = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang="en",
        )
        metrics['bertscore_f1'] = np.mean(bert_result['f1'])

        return metrics
```
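A quick usage example for the class above (BERTScore downloads a scoring model on first use):

```python
metrics = TextMetrics()
scores = metrics.evaluate(
    predictions=["ML is a subset of AI that learns from data."],
    references=["Machine learning is AI that learns patterns."],
)
print(scores)  # {'bleu': ..., 'rouge1': ..., 'rougeL': ..., 'bertscore_f1': ...}
```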
```python
class RAGMetrics:
    def __init__(self, llm):
        self.llm = llm

    def faithfulness(self, answer: str, contexts: list) -> float:
        """Check if the answer is supported by the retrieved contexts."""
        prompt = f"""Given the following context and answer, determine if the answer
is fully supported by the context.

Context: {' '.join(contexts)}
Answer: {answer}

Score from 0 (not supported) to 1 (fully supported):"""
        response = self.llm.generate(prompt)
        return float(response.strip())

    def relevance(self, question: str, answer: str) -> float:
        """Check if the answer is relevant to the question."""
        prompt = f"""Rate how relevant this answer is to the question.

Question: {question}
Answer: {answer}

Score from 0 (irrelevant) to 1 (highly relevant):"""
        response = self.llm.generate(prompt)
        return float(response.strip())

    def context_precision(self, question: str, contexts: list) -> float:
        """Check if the retrieved contexts are relevant to the question."""
        relevant_count = 0
        for ctx in contexts:
            prompt = f"""Is this context relevant to answering the question?

Question: {question}
Context: {ctx}

Answer Yes or No:"""
            if "yes" in self.llm.generate(prompt).lower():
                relevant_count += 1
        return relevant_count / len(contexts)
```
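Usage sketch; `llm` is any wrapper exposing a `generate(prompt) -> str` method, as the class above assumes:

```python
rag_metrics = RAGMetrics(llm)
score = rag_metrics.faithfulness(
    answer="ML is a subset of AI that learns from data.",
    contexts=["Machine learning is a field of AI..."],
)
print(f"Faithfulness: {score:.2f}")
```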
```python
class HallucinationDetector:
    def __init__(self, llm, knowledge_base=None):
        self.llm = llm
        self.knowledge_base = knowledge_base

    def detect(self, claim: str, source: str = None) -> dict:
        """Detect potential hallucinations in a claim."""
        results = {
            'claim': claim,
            'is_hallucination': False,
            'confidence': 0.0,
            'reason': ''
        }

        # Check against the source if provided
        if source:
            prompt = f"""Determine if this claim is supported by the source.

Source: {source}
Claim: {claim}

Is the claim fully supported? Answer with:
SUPPORTED, PARTIALLY_SUPPORTED, or NOT_SUPPORTED
Reason:"""
            response = self.llm.generate(prompt)
            if "NOT_SUPPORTED" in response:
                results['is_hallucination'] = True
                results['confidence'] = 0.9
            elif "PARTIALLY" in response:
                results['confidence'] = 0.5

        # Check for self-consistency across regenerations
        regenerations = [
            self.llm.generate(f"Verify: {claim}")
            for _ in range(3)
        ]
        consistency = self._check_consistency(regenerations)
        if consistency < 0.7:
            results['is_hallucination'] = True
            results['reason'] = 'Inconsistent across regenerations'

        return results
```
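`detect` calls a `_check_consistency` helper that is not defined above. A minimal sketch, added as a method of `HallucinationDetector`, that scores agreement as the average pairwise token overlap between regenerations (any sentence-similarity measure could stand in here):

```python
    def _check_consistency(self, responses: list) -> float:
        """Average pairwise Jaccard overlap of token sets (1.0 = identical)."""
        from itertools import combinations

        token_sets = [set(r.lower().split()) for r in responses]
        pairs = list(combinations(token_sets, 2))
        if not pairs:
            return 1.0
        overlaps = [len(a & b) / len(a | b) if (a | b) else 1.0 for a, b in pairs]
        return sum(overlaps) / len(overlaps)
```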
```python
from datasets import load_dataset

def evaluate_mmlu(model, tokenizer, subjects=None):
    # The "all" config holds every subject in one split, tagged by a `subject` column
    dataset = load_dataset("cais/mmlu", "all", split="test")
    subjects = subjects or sorted(set(dataset["subject"]))
    results = {}

    for subject in subjects:
        subject_data = dataset.filter(lambda ex: ex["subject"] == subject)
        correct = 0
        total = 0

        for example in subject_data:
            question = example['question']
            choices = example['choices']
            answer = example['answer']  # integer index of the correct choice

            # Format the multiple-choice prompt
            prompt = f"{question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{chr(65 + i)}. {choice}\n"
            prompt += "Answer:"

            # Get the model prediction (first character of the response)
            response = model.generate(prompt).strip()
            predicted = response[0].upper() if response else ""

            if predicted == chr(65 + answer):
                correct += 1
            total += 1

        results[subject] = correct / total
    return results
```
```python
def evaluate_humaneval(model, n_samples=10):
    from human_eval.data import read_problems
    from human_eval.execution import check_correctness

    problems = read_problems()
    per_problem_pass_rates = []

    for task_id, problem in problems.items():
        prompt = problem['prompt']

        # Generate n completions per problem
        completions = [model.generate(prompt) for _ in range(n_samples)]

        # Check functional correctness of each completion in a sandboxed run
        passed = sum(
            check_correctness(problem, completion, timeout=10.0)['passed']
            for completion in completions
        )

        # Unbiased pass@1 for a single problem is the fraction of samples that pass
        per_problem_pass_rates.append(passed / n_samples)

    pass_at_1 = sum(per_problem_pass_rates) / len(per_problem_pass_rates)
    return {'pass@1': pass_at_1}
```
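For k > 1, the usual approach is the unbiased estimator from the HumanEval paper (Chen et al., 2021), averaged over problems; the `human_eval` package ships an equivalent helper, but the formula is short enough to show directly:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k sampled completions passes,
    given that c of the n generated samples passed."""
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
```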
```python
from dataclasses import dataclass
from typing import List, Dict, Callable
import numpy as np

@dataclass
class EvaluationConfig:
    metrics: List[str]
    sample_size: int = 100
    confidence_level: float = 0.95

class LLMEvaluator:
    def __init__(self, model, config: EvaluationConfig):
        self.model = model
        self.config = config
        self.metrics_registry: Dict[str, Callable] = {}

    def register_metric(self, name: str, func: Callable):
        self.metrics_registry[name] = func

    def evaluate(self, test_data: List[dict]) -> dict:
        results = {metric: [] for metric in self.config.metrics}

        for sample in test_data[:self.config.sample_size]:
            prediction = self.model.generate(sample['input'])

            for metric_name in self.config.metrics:
                metric_func = self.metrics_registry[metric_name]
                score = metric_func(
                    prediction=prediction,
                    reference=sample.get('expected'),
                    context=sample.get('context')
                )
                results[metric_name].append(score)

        # Aggregate results
        aggregated = {}
        for metric, scores in results.items():
            aggregated[metric] = {
                'mean': np.mean(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores)
            }
        return aggregated
```
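Metrics are plain callables taking `prediction`, `reference`, and `context` keyword arguments. A usage sketch registering a simple exact-match metric (the `model` and `test_data` objects are assumed to follow the interface above):

```python
config = EvaluationConfig(metrics=["exact_match"], sample_size=50)
evaluator = LLMEvaluator(model, config)
evaluator.register_metric(
    "exact_match",
    lambda prediction, reference, context: float(
        prediction.strip().lower() == (reference or "").strip().lower()
    ),
)
report = evaluator.evaluate(test_data)  # {'exact_match': {'mean': ..., 'std': ..., ...}}
```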
```python
from typing import List
from scipy import stats
import numpy as np

class ABTester:
    def __init__(self, model_a, model_b, evaluator):
        self.model_a = model_a
        self.model_b = model_b
        self.evaluator = evaluator

    def run_test(self, test_data: List[dict], metric: str) -> dict:
        scores_a = []
        scores_b = []

        for sample in test_data:
            # Get predictions from both models
            pred_a = self.model_a.generate(sample['input'])
            pred_b = self.model_b.generate(sample['input'])

            # Evaluate each prediction (evaluate_single is assumed to return a dict of metric scores)
            score_a = self.evaluator.evaluate_single(pred_a, sample)
            score_b = self.evaluator.evaluate_single(pred_b, sample)

            scores_a.append(score_a[metric])
            scores_b.append(score_b[metric])

        # Paired t-test on the per-sample scores
        t_stat, p_value = stats.ttest_rel(scores_a, scores_b)

        return {
            'model_a_mean': np.mean(scores_a),
            'model_b_mean': np.mean(scores_b),
            'improvement': (np.mean(scores_b) - np.mean(scores_a)) / np.mean(scores_a),
            'p_value': p_value,
            'significant': p_value < 0.05
        }
```
| Metric | Range | Best For | Interpretation |
|---|---|---|---|
| BLEU | 0-1 | Translation | Higher = better n-gram match |
| ROUGE-L | 0-1 | Summarization | Higher = better recall |
| BERTScore | 0-1 | General | Higher = semantic similarity |
| Faithfulness | 0-1 | RAG | Higher = grounded in context |
| Perplexity | 1-∞ | Language model | Lower = better fluency |
| Pass@k | 0-1 | Code gen | Higher = more correct samples |
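Perplexity appears in the table but not in the code above; a minimal sketch for a causal language model with Hugging Face Transformers (the `gpt2` checkpoint is just an illustrative default):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def compute_perplexity(text: str, model_name: str = "gpt2") -> float:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # Passing labels=input_ids makes the model return the mean cross-entropy loss
        outputs = model(**inputs, labels=inputs["input_ids"])
    return torch.exp(outputs.loss).item()
```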
```python
from tenacity import retry, stop_after_attempt

@retry(stop=stop_after_attempt(3))
def evaluate_with_retry(model_output, reference):
    return evaluator.evaluate(model_output, reference)

def batch_evaluate(samples, batch_size=50):
    # Each sample is assumed to be an (output, reference) pair
    results = []
    for i in range(0, len(samples), batch_size):
        batch = samples[i:i + batch_size]
        results.extend([evaluate_with_retry(output, reference) for output, reference in batch])
    return results
```
| Symptom | Cause | Solution |
|---|---|---|
| Inconsistent scores | High temperature | Set temp=0 for evaluator |
| Slow evaluation | No batching | Batch evaluations |
| Missing metrics | Wrong format | Check data schema |
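For the "inconsistent scores" row, the judge model should be deterministic. A sketch assuming a LangChain chat model is used as the evaluator LLM (the model name is illustrative):

```python
from langchain_openai import ChatOpenAI
from langchain.evaluation import load_evaluator

judge_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
evaluator = load_evaluator("criteria", criteria="helpfulness", llm=judge_llm)
```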
```python
def test_faithfulness_metric():
    # evaluate_faithfulness stands in for whichever faithfulness scorer is wired up
    score = evaluate_faithfulness("Answer", ["Context"])
    assert 0 <= score <= 1
```