From llm-evals
Orchestrates LLM evaluation workflows - coordinates promptfoo, DeepEval, datasets, and tracking
```bash
npx claudepluginhub vanman2024/ai-dev-marketplace --plugin llm-evals
```

I coordinate LLM evaluation workflows, managing the integration between promptfoo, DeepEval, golden datasets, and Supabase tracking. I ensure comprehensive test coverage and continuous evaluation.

1. **Pipeline Coordination** - Orchestrate multi-tool eval workflows
2. **CI/CD Integration** - GitHub Actions, scheduled runs
3. **Reporting** - Aggregate results, generate dashboards
4. **Alerting**...
```
evals/
├── promptfoo/
│   ├── promptfooconfig.yaml
│   ├── prompts/
│   │   ├── system.txt
│   │   └── user.txt
│   └── outputs/
├── deepeval/
│   ├── conftest.py
│   ├── test_faithfulness.py
│   ├── test_relevance.py
│   └── test_custom.py
├── datasets/
│   ├── golden/
│   │   ├── qa.json
│   │   ├── summarization.json
│   │   └── code.json
│   └── generated/
├── tracking/
│   ├── schema.sql
│   └── queries/
└── .github/
    └── workflows/
        └── evals.yml
```
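`promptfooconfig.yaml` is what the promptfoo CI job evaluates. A minimal sketch, assuming the two prompt files above reference a `{{question}}` variable; the provider, test input, and assertion are illustrative, not part of the plugin:

```yaml
# Sketch of evals/promptfoo/promptfooconfig.yaml; provider and test values are illustrative.
prompts:
  - file://prompts/system.txt
  - file://prompts/user.txt
providers:
  - openai:gpt-4o-mini
tests:
  - vars:
      question: "Summarize the release notes in one sentence."
    assert:
      - type: contains
        value: "release"
```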
```sql
-- Eval runs table
CREATE TABLE eval_runs (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  name TEXT NOT NULL,
  tool TEXT NOT NULL,              -- 'promptfoo' | 'deepeval'
  status TEXT DEFAULT 'running',   -- 'running' | 'completed' | 'failed'
  config JSONB,
  summary JSONB,
  started_at TIMESTAMPTZ DEFAULT now(),
  completed_at TIMESTAMPTZ,
  created_by UUID REFERENCES auth.users(id)
);

-- Eval cases table
CREATE TABLE eval_cases (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  run_id UUID REFERENCES eval_runs(id) ON DELETE CASCADE,
  input TEXT NOT NULL,
  expected_output TEXT,
  actual_output TEXT,
  passed BOOLEAN,
  metadata JSONB,
  created_at TIMESTAMPTZ DEFAULT now()
);

-- Eval scores table
CREATE TABLE eval_scores (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  case_id UUID REFERENCES eval_cases(id) ON DELETE CASCADE,
  metric TEXT NOT NULL,      -- 'faithfulness', 'relevance', 'toxicity', etc.
  score NUMERIC(5,4),        -- 0.0000 to 1.0000
  threshold NUMERIC(5,4),
  passed BOOLEAN,
  reasoning TEXT,
  created_at TIMESTAMPTZ DEFAULT now()
);
```
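The `tracking/queries/` directory is a natural home for rollups over these tables. One sketch, computing per-metric pass rates and average scores for a single run; the filename and the `:run_id` bind-parameter style are hypothetical:

```sql
-- Sketch of a rollup query (hypothetical tracking/queries/pass_rate.sql).
SELECT
  s.metric,
  count(*) FILTER (WHERE s.passed) AS passed_cases,
  count(*)                         AS total_cases,
  round(avg(s.score), 4)           AS avg_score
FROM eval_scores s
JOIN eval_cases c ON c.id = s.case_id
WHERE c.run_id = :run_id  -- bind parameter; syntax depends on your SQL client
GROUP BY s.metric
ORDER BY s.metric;
```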
```sql
-- Enable row-level security (rows stay inaccessible until policies are added)
ALTER TABLE eval_runs ENABLE ROW LEVEL SECURITY;
ALTER TABLE eval_cases ENABLE ROW LEVEL SECURITY;
ALTER TABLE eval_scores ENABLE ROW LEVEL SECURITY;
```
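With RLS enabled, Supabase's anon/authenticated roles are denied access to these tables until at least one policy exists (the service role bypasses RLS). A minimal sketch for `eval_runs`, assuming Supabase's `auth.uid()` helper and the `created_by` column above; the policy name and read-own scope are illustrative:

```sql
-- Illustrative policy: authenticated users can read the runs they created.
-- eval_cases and eval_scores would need corresponding policies (e.g. via run ownership).
CREATE POLICY "read own runs" ON eval_runs
  FOR SELECT
  USING (auth.uid() = created_by);
```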
```yaml
name: LLM Evals

on:
  push:
    branches: [main]
  pull_request:
  schedule:
    - cron: '0 6 * * *'  # Daily at 06:00 UTC

jobs:
  promptfoo:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
      - run: npm install -g promptfoo
      - run: promptfoo eval --output results.json
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      - uses: actions/upload-artifact@v4
        with:
          name: promptfoo-results
          path: results.json

  deepeval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install deepeval pytest
      - run: pytest evals/deepeval/ --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
```
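The deepeval job picks up any pytest files under `evals/deepeval/`. A minimal sketch of `test_faithfulness.py` using DeepEval's `assert_test` helper; the inputs, context, and 0.7 threshold are illustrative, and real cases would presumably be loaded from `datasets/golden/`:

```python
# Sketch of evals/deepeval/test_faithfulness.py; values are illustrative.
# Needs OPENAI_API_KEY at runtime (the workflow above provides it).
from deepeval import assert_test
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase


def test_faithfulness():
    case = LLMTestCase(
        input="What is promptfoo?",
        actual_output="promptfoo is a CLI for evaluating LLM prompts.",
        retrieval_context=["promptfoo is an open-source tool for testing prompts."],
    )
    # Fails the test if the faithfulness score drops below the threshold.
    assert_test(case, [FaithfulnessMetric(threshold=0.7)])
```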