# synthetic-data
Generate a synthetic tabular dataset from a JSON schema describing columns, types, and Faker providers.
npx claudepluginhub danielrosehill/claude-code-plugins --plugin synthetic-data

This skill uses the workspace's default tool permissions.
Build a synthetic dataset from scratch using a user-supplied schema. Each column is defined by type (string, int, float, date, etc.), a Faker provider (e.g., "name", "email", "address"), or a distribution.
Conducts multi-round deep research on GitHub repos via API and web searches, generating markdown reports with executive summaries, timelines, metrics, and Mermaid diagrams.
Share bugs, ideas, or general feedback.
Build a synthetic dataset from scratch using a user-supplied schema. Each column is defined by type (string, int, float, date, etc.), a Faker provider (e.g., "name", "email", "address"), or a distribution.
Each column entry supports: name, type, faker_provider, and optional locale and constraints. Output is written to ./synthetic-data-workspace/outputs/. Prepare a schema file (e.g., schema.json):
{
"columns": [
{
"name": "customer_id",
"type": "int",
"distribution": {"type": "sequential"}
},
{
"name": "name",
"type": "string",
"faker_provider": "name"
},
{
"name": "email",
"type": "string",
"faker_provider": "email"
},
{
"name": "phone",
"type": "string",
"faker_provider": "phone_number"
},
{
"name": "address",
"type": "string",
"faker_provider": "address"
},
{
"name": "signup_date",
"type": "date",
"faker_provider": "date_between",
"faker_kwargs": {"start_date": "-5y"}
},
{
"name": "purchase_amount",
"type": "float",
"distribution": {"type": "normal", "mean": 100, "std": 25}
}
]
}
Write a generation script:
import json
import pandas as pd
import numpy as np
from faker import Faker
def generate_from_schema(schema_path, num_rows, output_path, locale='en_US'):
    """Generate a synthetic tabular dataset from a JSON column schema.

    Each column in ``schema['columns']`` is produced from either a Faker
    provider (``faker_provider`` with optional ``faker_kwargs``) or a
    ``distribution`` spec (``sequential``, ``normal``, or ``uniform``).
    Columns with neither are filled with ``None``.

    Args:
        schema_path: Path to a JSON file with a top-level ``columns`` list.
        num_rows: Number of rows to generate.
        output_path: Destination file; ``.parquet`` suffix selects Parquet,
            anything else is written as CSV.
        locale: Faker locale (e.g. ``en_US``, ``de_DE``, ``ja_JP``).

    Raises:
        ValueError: If a column declares an unsupported distribution type
            (previously such columns were silently dropped from the output).
    """
    with open(schema_path) as f:
        schema = json.load(f)

    # Created lazily: faker is only needed when a column uses a provider.
    fake = None
    data = {}
    for col in schema['columns']:
        name = col['name']
        col_type = col['type']
        provider = col.get('faker_provider')
        if provider:
            if fake is None:
                from faker import Faker  # local import keeps the dependency optional
                fake = Faker(locale)
            method = getattr(fake, provider)
            kwargs = col.get('faker_kwargs') or {}
            data[name] = [method(**kwargs) for _ in range(num_rows)]
        elif 'distribution' in col:
            dist = col['distribution']
            dist_type = dist['type']
            if dist_type == 'sequential':
                values = list(range(1, num_rows + 1))
            elif dist_type == 'normal':
                values = np.random.normal(dist['mean'], dist['std'], num_rows)
            elif dist_type == 'uniform':
                values = np.random.uniform(dist['min'], dist['max'], num_rows)
            else:
                # Fail loudly instead of silently omitting the column.
                raise ValueError(f"Unsupported distribution type: {dist_type!r}")
            # Honor the schema's declared type: distributions yield floats,
            # so int columns are rounded and cast.
            if col_type == 'int':
                values = np.asarray(values).round().astype(int)
            data[name] = values
        else:
            data[name] = [None] * num_rows

    df = pd.DataFrame(data)
    if output_path.endswith('.parquet'):
        df.to_parquet(output_path, index=False)
    else:
        df.to_csv(output_path, index=False)
    print(f"Generated {num_rows} rows to {output_path}")
    print(df.head())
if __name__ == '__main__':
    # Example invocation: 1,000 rows from schema.json, written as CSV.
    generate_from_schema(
        schema_path='schema.json',
        num_rows=1000,
        output_path='synthetic_data.csv',
    )
Run the script:
python generate_from_schema.py
Verify output:
head -5 synthetic_data.csv
wc -l synthetic_data.csv # Check row count
The output contains num_rows rows with realistic faker-generated values. Supported locales include en_US, de_DE, and ja_JP.