# synthetic-data
Run fidelity, utility, and privacy diagnostics on a synthetic dataset against the real source, generating a markdown report.
```bash
npx claudepluginhub danielrosehill/claude-code-plugins --plugin synthetic-data
```

This skill uses the workspace's default tool permissions.
Run comprehensive diagnostics on a synthetic dataset against a real source dataset. Generates fidelity (how well statistics match), utility (usefulness for ML tasks), and privacy (leakage risk) reports. Outputs a markdown summary in `reports/<timestamp>/`.
The skill supports two evaluation modes: tabular evaluation for structured data, and text evaluation for unstructured records. Reports are saved under `./synthetic-data-workspace/reports/`.

Install SDMetrics:
```bash
pip install sdmetrics
```
Run fidelity and diagnostic reports:
```python
import os
from datetime import datetime

import pandas as pd
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport


def build_metadata(df):
    """Build the single-table metadata dict that the SDMetrics reports require."""
    columns = {}
    for col, dtype in df.dtypes.items():
        if pd.api.types.is_numeric_dtype(dtype):
            columns[col] = {'sdtype': 'numerical'}
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            columns[col] = {'sdtype': 'datetime'}
        else:
            columns[col] = {'sdtype': 'categorical'}
    return {'columns': columns}


def evaluate_tabular(real_path, synth_path, output_dir):
    real_data = pd.read_csv(real_path)
    synth_data = pd.read_csv(synth_path)
    print(f"Real data shape: {real_data.shape}")
    print(f"Synthetic data shape: {synth_data.shape}")

    metadata = build_metadata(real_data)

    # Quality report (fidelity metrics: column shapes and column-pair trends)
    quality_report = QualityReport()
    quality_report.generate(real_data, synth_data, metadata)

    # Diagnostic report (data validity and structure checks)
    diag_report = DiagnosticReport()
    diag_report.generate(real_data, synth_data, metadata)

    # Save the report objects; per-property plots are available via get_visualization()
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    quality_report.save(f'{output_dir}/quality_report_{timestamp}.pkl')
    diag_report.save(f'{output_dir}/diagnostic_report_{timestamp}.pkl')

    # Extract summary scores
    quality_score = quality_report.get_score()
    diag_score = diag_report.get_score()
    print(f"\nQuality score: {quality_score:.3f}")
    print(f"Diagnostic score: {diag_score:.3f}")
    return quality_report, diag_report


if __name__ == '__main__':
    evaluate_tabular('real_data.csv', 'synthetic_data.csv', './reports')
```
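The utility report mentioned in the skill description is not covered by the snippet above. A common approach is train-on-synthetic, test-on-real (TSTR): fit the same model once on real training data and once on synthetic data, then compare both on a held-out real test set. Below is a minimal sketch using scikit-learn, not part of the skill itself; the `target` column name, the random-forest model, the binary label, and the assumption of already-encoded numeric features are all illustrative choices.

```python
# Minimal TSTR (train-on-synthetic, test-on-real) utility check.
# Assumes a binary classification task with a label column named 'target'
# and numeric, already-encoded feature columns; adjust for your data.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def tstr_utility(real_path, synth_path, target='target'):
    real = pd.read_csv(real_path)
    synth = pd.read_csv(synth_path)[real.columns]  # align column order

    # Hold out part of the real data purely for testing
    real_train, real_test = train_test_split(real, test_size=0.3, random_state=42)

    def fit_and_score(train_df):
        model = RandomForestClassifier(n_estimators=200, random_state=42)
        model.fit(train_df.drop(columns=[target]), train_df[target])
        probs = model.predict_proba(real_test.drop(columns=[target]))[:, 1]
        return roc_auc_score(real_test[target], probs)

    real_auc = fit_and_score(real_train)   # baseline: train on real
    synth_auc = fit_and_score(synth)       # utility of the synthetic data
    print(f"Train-on-real AUC:      {real_auc:.3f}")
    print(f"Train-on-synthetic AUC: {synth_auc:.3f}")
    return real_auc, synth_auc
```

A small gap between the two AUC values suggests the synthetic data preserves the signal the downstream task needs; a large gap points to lost utility even if fidelity scores look good.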
Generate a markdown summary:
```python
# Reuses the imports and the build_metadata() helper from the previous snippet.
def generate_tabular_report(real_path, synth_path, output_dir, report_name="eval_report.md"):
    real_data = pd.read_csv(real_path)
    synth_data = pd.read_csv(synth_path)

    quality_report = QualityReport()
    quality_report.generate(real_data, synth_data, build_metadata(real_data))
    quality_score = quality_report.get_score()

    # Basic statistics comparison
    report_md = f"""# Synthetic Data Evaluation Report

Overall quality score: {quality_score:.3f}

This measures how well synthetic data statistics match real data.

| Column | Real Mean | Synth Mean | Real Std | Synth Std |
|---|---|---|---|---|
"""
    for col in real_data.select_dtypes(include=['number']).columns:
        real_mean = real_data[col].mean()
        synth_mean = synth_data[col].mean()
        real_std = real_data[col].std()
        synth_std = synth_data[col].std()
        report_md += f"| {col} | {real_mean:.3f} | {synth_mean:.3f} | {real_std:.3f} | {synth_std:.3f} |\n"

    report_md += f"""
Compare Pearson correlations between real and synthetic data.

Real data correlation matrix:

{real_data.corr(numeric_only=True).to_string()}

Synthetic data correlation matrix:

{synth_data.corr(numeric_only=True).to_string()}

If score < 0.7: Consider refitting with different model/hyperparameters
If score > 0.9: Good fidelity; verify utility on downstream tasks
Always conduct domain-specific validation before production use
"""

    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/{report_name}', 'w') as f:
        f.write(report_md)
    print(f"Report saved to {output_dir}/{report_name}")
```
Install embedding tools:
```bash
pip install sentence-transformers umap-learn scikit-learn matplotlib
```
Run embedding distribution and leakage checks:
```python
import json
import os
import re
from collections import Counter
from datetime import datetime

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def extract_ngrams(texts, n=3):
    """Count word n-grams across a list of texts (rough lexical diversity measure)."""
    all_ngrams = Counter()
    for text in texts:
        tokens = re.findall(r'\w+', text.lower())
        for i in range(len(tokens) - n + 1):
            all_ngrams[tuple(tokens[i:i + n])] += 1
    return all_ngrams


def evaluate_text(real_path, synth_path, output_dir):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load JSONL records
    with open(real_path) as f:
        real_records = [json.loads(line) for line in f]
    with open(synth_path) as f:
        synth_records = [json.loads(line) for line in f]

    # Concatenate all fields of each record into one string for embedding
    real_texts = [' '.join(str(v) for v in r.values()) for r in real_records]
    synth_texts = [' '.join(str(v) for v in r.values()) for r in synth_records]

    # Embed
    real_embeddings = model.encode(real_texts, convert_to_numpy=True)
    synth_embeddings = model.encode(synth_texts, convert_to_numpy=True)

    # Nearest-neighbour leakage check: how close is each synthetic record
    # to its most similar real record?
    similarities = cosine_similarity(synth_embeddings, real_embeddings)
    max_sims = np.max(similarities, axis=1)
    mean_sim = np.mean(max_sims)

    report_md = f"""# Synthetic Text Evaluation Report

Measures how closely synthetic records match their nearest real records by embedding similarity.

Mean nearest-neighbour similarity: {mean_sim:.3f}

Flagged records (similarity > 0.8):
"""
    for i, sim in enumerate(max_sims):
        if sim > 0.8:
            real_idx = np.argmax(similarities[i])
            report_md += f"- Synth[{i}] ↔ Real[{real_idx}]: {sim:.3f}\n"

    # N-gram diversity
    real_ngrams = extract_ngrams(real_texts, n=3)
    synth_ngrams = extract_ngrams(synth_texts, n=3)

    report_md += f"""
Distinct 3-grams: real {len(real_ngrams)}, synthetic {len(synth_ngrams)}

If leakage scores > 0.7: Consider re-running synthesis with stricter transformation rules
If diversity low: May need more varied personas or longer generation
Validate semantic quality by manual spot-check of a few records
"""

    os.makedirs(output_dir, exist_ok=True)
    report_path = f'{output_dir}/text_eval_{datetime.now().strftime("%Y%m%d_%H%M%S")}.md'
    with open(report_path, 'w') as f:
        f.write(report_md)
    print(f"Report saved to {report_path}")
    return {"mean_similarity": mean_sim, "n_gram_diversity": len(synth_ngrams)}
```