From synthetic-data
Run fidelity, utility, and privacy diagnostics on a synthetic dataset against the real source, generating a markdown report.
How this skill is triggered — by the user, by Claude, or both
Slash command
/synthetic-data:evaluate-quality
The summary Claude sees in its skill listing — used to decide when to auto-load this skill
Run comprehensive diagnostics on a synthetic dataset against a real source dataset. Generates fidelity (how well statistics match), utility (usefulness for ML tasks), and privacy (leakage risk) reports. Outputs a markdown summary in `reports/<timestamp>/`.
Run comprehensive diagnostics on a synthetic dataset against a real source dataset. Generates fidelity (how well statistics match), utility (usefulness for ML tasks), and privacy (leakage risk) reports. Outputs a markdown summary in reports/<timestamp>/.
tabular or text
./synthetic-data-workspace/reports/)
text evaluation (for unstructured records)
Install SDMetrics:
pip install sdmetrics
Run fidelity and diagnostic reports:
import pandas as pd
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport
from datetime import datetime
import os
def _infer_sdmetrics_metadata(frame):
    """Build a minimal SDMetrics single-table metadata dict from pandas dtypes.

    Every column of ``frame`` is mapped to one ``sdtype``: ``boolean``,
    ``numerical``, ``datetime`` or ``categorical`` (the fallback for anything
    else, e.g. strings read from CSV).
    """
    columns = {}
    for name, dtype in frame.dtypes.items():
        # Check bool first: pandas also reports bool as a numeric dtype.
        if pd.api.types.is_bool_dtype(dtype):
            sdtype = 'boolean'
        elif pd.api.types.is_numeric_dtype(dtype):
            sdtype = 'numerical'
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            sdtype = 'datetime'
        else:
            sdtype = 'categorical'
        columns[name] = {'sdtype': sdtype}
    return {'columns': columns}


def evaluate_tabular(real_path, synth_path, output_dir, metadata=None):
    """Run the SDMetrics quality and diagnostic reports on two CSV files.

    Args:
        real_path: Path to the CSV holding the real source data.
        synth_path: Path to the CSV holding the synthetic data.
        output_dir: Directory the reports are written to (created if missing).
        metadata: Optional SDMetrics single-table metadata dict
            (``{"columns": {name: {"sdtype": ...}}}``). When omitted, a
            minimal one is inferred from the dtypes of the real data.

    Returns:
        A ``(quality_report, diag_report)`` tuple of the generated reports.
    """
    real_data = pd.read_csv(real_path)
    synth_data = pd.read_csv(synth_path)
    print(f"Real data shape: {real_data.shape}")
    print(f"Synthetic data shape: {synth_data.shape}")

    # SDMetrics reports need column metadata; generate() cannot run without it.
    if metadata is None:
        metadata = _infer_sdmetrics_metadata(real_data)

    # Quality report (fidelity metrics)
    quality_report = QualityReport()
    quality_report.generate(real_data, synth_data, metadata)

    # Diagnostic report (column properties, relationships)
    diag_report = DiagnosticReport()
    diag_report.generate(real_data, synth_data, metadata)

    # Persist both reports. SDMetrics documents `save()` (pickle format,
    # reloadable with `QualityReport.load` / `DiagnosticReport.load`); the
    # report classes do not document an HTML export.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    quality_report.save(os.path.join(output_dir, f'quality_report_{timestamp}.pkl'))
    diag_report.save(os.path.join(output_dir, f'diagnostic_report_{timestamp}.pkl'))

    # Extract summary scores
    quality_score = quality_report.get_score()
    diag_score = diag_report.get_score()
    print(f"\nQuality score: {quality_score:.3f}")
    print(f"Diagnostic score: {diag_score:.3f}")

    return quality_report, diag_report
if __name__ == '__main__':
    # Script entry point: evaluate the default CSV pair and write the
    # reports under ./reports.
    real_csv, synth_csv, report_dir = 'real_data.csv', 'synthetic_data.csv', './reports'
    evaluate_tabular(real_csv, synth_csv, report_dir)
Generate a markdown summary:
def _column_sdtype(series):
    """Map a pandas Series to the SDMetrics ``sdtype`` matching its dtype."""
    # Check bool first: pandas also reports bool as a numeric dtype.
    if pd.api.types.is_bool_dtype(series):
        return 'boolean'
    if pd.api.types.is_numeric_dtype(series):
        return 'numerical'
    if pd.api.types.is_datetime64_any_dtype(series):
        return 'datetime'
    return 'categorical'


def generate_tabular_report(real_path, synth_path, output_dir,
                            report_name="eval_report.md", metadata=None):
    """Write a markdown fidelity summary comparing real and synthetic CSVs.

    The report contains the overall SDMetrics quality score, a per-column
    mean/std comparison for the numeric columns of the real data, the Pearson
    correlation matrices of both datasets, and follow-up recommendations.

    Args:
        real_path: Path to the CSV holding the real source data.
        synth_path: Path to the CSV holding the synthetic data.
        output_dir: Directory the report is written to (created if missing).
        report_name: File name of the markdown report inside ``output_dir``.
        metadata: Optional SDMetrics single-table metadata dict. When omitted,
            a minimal one is inferred from the dtypes of the real data.
    """
    real_data = pd.read_csv(real_path)
    synth_data = pd.read_csv(synth_path)

    # SDMetrics reports need column metadata; generate() cannot run without it.
    if metadata is None:
        metadata = {
            'columns': {
                col: {'sdtype': _column_sdtype(real_data[col])}
                for col in real_data.columns
            }
        }

    quality_report = QualityReport()
    quality_report.generate(real_data, synth_data, metadata)
    quality_score = quality_report.get_score()

    # Basic statistics comparison
    lines = [
        "# Synthetic Data Evaluation Report",
        "",
        f"Overall quality score: {quality_score:.3f}",
        "",
        "## Fidelity",
        "",
        "This measures how well synthetic data statistics match real data.",
        "",
        "| Column | Real Mean | Synth Mean | Real Std | Synth Std |",
        "|---|---|---|---|---|",
    ]
    for col in real_data.select_dtypes(include=['number']).columns:
        real_col = real_data[col]
        # A column may be absent or non-numeric on the synthetic side; report
        # that in the table instead of aborting the whole report.
        if col in synth_data.columns and pd.api.types.is_numeric_dtype(synth_data[col]):
            synth_mean = f"{synth_data[col].mean():.3f}"
            synth_std = f"{synth_data[col].std():.3f}"
        else:
            synth_mean = synth_std = "n/a"
        lines.append(
            f"| {col} | {real_col.mean():.3f} | {synth_mean} "
            f"| {real_col.std():.3f} | {synth_std} |"
        )

    # Restrict to numeric columns explicitly: DataFrame.corr() raises on
    # string columns in pandas 2.x, and this form works on older versions too.
    real_corr = real_data.select_dtypes(include=['number']).corr().to_string()
    synth_corr = synth_data.select_dtypes(include=['number']).corr().to_string()

    lines += [
        "",
        "## Correlations",
        "",
        "Compare Pearson correlations between real and synthetic data.",
        "",
        "Real data correlation matrix:",
        "",
        "```",
        real_corr,
        "```",
        "",
        "Synthetic data correlation matrix:",
        "",
        "```",
        synth_corr,
        "```",
        "",
        "## Recommendations",
        "",
        "- If score < 0.7: Consider refitting with different model/hyperparameters",
        "- If score > 0.9: Good fidelity; verify utility on downstream tasks",
        "- Always conduct domain-specific validation before production use",
        "",
    ]

    os.makedirs(output_dir, exist_ok=True)
    report_path = f'{output_dir}/{report_name}'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(lines))
    print(f"Report saved to {output_dir}/{report_name}")
Install embedding tools:
pip install sentence-transformers umap-learn scikit-learn matplotlib
Run embedding distribution and leakage checks:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
def _load_jsonl(path):
    """Read a JSON Lines file into a list of records, skipping blank lines."""
    with open(path, encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _count_ngrams(texts, n=3):
    """Count lower-cased word n-grams across all ``texts``."""
    from collections import Counter
    import re

    counts = Counter()
    for text in texts:
        tokens = re.findall(r'\w+', text.lower())
        # Zipping n shifted views of the token list yields every n-gram tuple.
        counts.update(zip(*(tokens[offset:] for offset in range(n))))
    return counts


def evaluate_text(real_path, synth_path, output_dir):
    """Check synthetic text records for leakage and diversity against real ones.

    Both inputs are JSON Lines files of objects. All field values of a record
    are joined into one string, embedded with ``all-MiniLM-L6-v2``, and each
    synthetic record is compared to its nearest real record by cosine
    similarity. A markdown report is written to ``output_dir``.

    Args:
        real_path: Path to the JSONL file with real records.
        synth_path: Path to the JSONL file with synthetic records.
        output_dir: Directory the report is written to (created if missing).

    Returns:
        A dict with ``mean_similarity`` (mean nearest-neighbour similarity) and
        ``n_gram_diversity`` (number of distinct trigrams in the synthetic text).

    Raises:
        ValueError: If either input file contains no records.
    """
    from datetime import datetime
    import os

    # Load records
    real_records = _load_jsonl(real_path)
    synth_records = _load_jsonl(synth_path)
    if not real_records or not synth_records:
        raise ValueError("Both real and synthetic inputs must contain at least one record")

    # Load the model only after the inputs are known to be usable.
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Concatenate all fields for embedding
    real_texts = [' '.join(str(v) for v in r.values()) for r in real_records]
    synth_texts = [' '.join(str(v) for v in r.values()) for r in synth_records]

    # Embed
    real_embeddings = model.encode(real_texts, convert_to_numpy=True)
    synth_embeddings = model.encode(synth_texts, convert_to_numpy=True)

    # Nearest-neighbour leakage check: one row per synthetic record.
    similarities = cosine_similarity(synth_embeddings, real_embeddings)
    nearest_real = np.argmax(similarities, axis=1)
    max_sims = similarities[np.arange(len(synth_texts)), nearest_real]
    mean_sim = float(np.mean(max_sims))

    # N-gram diversity
    real_ngrams = _count_ngrams(real_texts, n=3)
    synth_ngrams = _count_ngrams(synth_texts, n=3)

    lines = [
        "# Synthetic Text Evaluation Report",
        "",
        "Measures how closely synthetic records match nearest real records by embedding similarity.",
        "",
        f"Mean nearest-neighbour similarity: {mean_sim:.3f}",
        "",
        "Interpretation:",
        "- Higher similarity means a synthetic record sits closer to a real record.",
        "- Records above 0.8 are flagged below as potential leakage.",
        "",
        "Flagged records (similarity > 0.8):",
    ]
    flagged = [
        f"- Synth[{i}] ↔ Real[{nearest_real[i]}]: {sim:.3f}"
        for i, sim in enumerate(max_sims)
        if sim > 0.8
    ]
    lines += flagged or ["- None"]
    lines += [
        "",
        "N-gram diversity (distinct trigrams):",
        f"- Real: {len(real_ngrams)}",
        f"- Synthetic: {len(synth_ngrams)}",
        "",
        "Recommendations:",
        "- If leakage scores > 0.7: Consider re-running synthesis with stricter transformation rules",
        "- If diversity low: May need more varied personas or longer generation",
        "- Validate semantic quality by manual spot-check of a few records",
        "",
    ]

    os.makedirs(output_dir, exist_ok=True)
    report_path = f'{output_dir}/text_eval_{datetime.now().strftime("%Y%m%d_%H%M%S")}.md'
    # The report contains non-ASCII characters, so pin the encoding instead of
    # relying on the platform default.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(lines))
    print(f"Report saved to {report_path}")

    return {"mean_similarity": mean_sim, "n_gram_diversity": len(synth_ngrams)}
reports/<timestamp>/
npx claudepluginhub danielrosehill/claude-code-plugins --plugin synthetic-data
Searches MemPalace before answering questions about past work, people, projects, or prior decisions. Returns verbatim stored content instead of guessing from model memory.
Scans the codebase for `ponytail:` comments and compiles a debt ledger of deliberate shortcuts and deferrals, flagging entries with no upgrade path.