Skill

data-analysis

Provides Python patterns for loading datasets with pandas, exploratory data analysis (descriptive stats, distributions, correlations), statistical tests (t-test, ANOVA), and confidence intervals using scipy and numpy.

Python

data-engineering

npx claudepluginhub yeachan-heo/my-jogyo --plugin gyoshu

Tool Access

This skill uses the workspace's default tool permissions.

Preview

Load this skill when working with datasets that require exploration, cleaning, and statistical analysis.

SKILL.md

Similar Skills

finishing-a-development-branch

174.6k

Verifies tests pass on completed feature branch, presents options to merge locally, create GitHub PR, keep as-is or discard; executes choice and cleans up worktree.

superpowers

systematic-debugging

174.6k

Guides root cause investigation for bugs, test failures, unexpected behavior, performance issues, and build failures before proposing fixes.

10 files

superpowers

writing-plans

174.6k

Writes implementation plans from specs for multi-step tasks, mapping files and breaking into TDD bite-sized steps before coding.

1 file

superpowers

Stats

Stars152

Forks54

Last CommitJan 13, 2026

Actions

View Source View Plugin View on GitHub View README

Data Analysis Patterns

When to Use

Load this skill when working with datasets that require exploration, cleaning, and statistical analysis.

Data Loading

print("[DATA] Loading dataset")
df = pd.read_csv("data.csv")
print(f"[SHAPE] {df.shape[0]} rows, {df.shape[1]} columns")
print(f"[DTYPE] {dict(df.dtypes)}")
print(f"[MISSING] {df.isnull().sum().to_dict()}")

Exploratory Data Analysis (EDA)

Descriptive Statistics

print("[STAT] Descriptive statistics:")
print(df.describe())

print(f"[RANGE] {col}: {df[col].min()} to {df[col].max()}")

Distribution Analysis

print("[ANALYSIS] Checking distribution normality")
from scipy import stats
stat, p_value = stats.shapiro(df[col])
print(f"[STAT] Shapiro-Wilk p-value: {p_value:.4f}")

Correlation Analysis

print("[CORR] Correlation matrix:")
print(df.corr())

Statistical Tests

T-Test

from scipy.stats import ttest_ind
stat, p = ttest_ind(group1, group2)
print(f"[STAT] T-test: t={stat:.3f}, p={p:.4f}")

ANOVA

from scipy.stats import f_oneway
stat, p = f_oneway(group1, group2, group3)
print(f"[STAT] ANOVA: F={stat:.3f}, p={p:.4f}")

Confidence Interval Patterns

Parametric CI for Means

import numpy as np
from scipy import stats

def mean_ci(data, confidence=0.95):
    """Calculate parametric confidence interval for mean."""
    n = len(data)
    mean = np.mean(data)
    se = stats.sem(data)  # Standard error of mean
    h = se * stats.t.ppf((1 + confidence) / 2, n - 1)
    return mean, mean - h, mean + h

mean, ci_low, ci_high = mean_ci(df[col])
print(f"[STAT:estimate] mean = {mean:.3f}")
print(f"[STAT:ci] 95% CI [{ci_low:.3f}, {ci_high:.3f}]")

Bootstrap CI for Medians/Complex Statistics

import numpy as np

def bootstrap_ci(data, stat_func=np.median, n_bootstrap=10000, confidence=0.95):
    """Calculate bootstrap confidence interval for any statistic."""
    boot_stats = []
    n = len(data)
    for _ in range(n_bootstrap):
        sample = np.random.choice(data, size=n, replace=True)
        boot_stats.append(stat_func(sample))

    alpha = 1 - confidence
    ci_low = np.percentile(boot_stats, 100 * alpha / 2)
    ci_high = np.percentile(boot_stats, 100 * (1 - alpha / 2))
    return stat_func(data), ci_low, ci_high

median, ci_low, ci_high = bootstrap_ci(df[col], stat_func=np.median)
print(f"[STAT:estimate] median = {median:.3f}")
print(f"[STAT:ci] 95% Bootstrap CI [{ci_low:.3f}, {ci_high:.3f}]")

Wilson CI for Proportions

from scipy import stats

def wilson_ci(successes, trials, confidence=0.95):
    """Calculate Wilson score interval for proportions (better for small n)."""
    p = successes / trials
    z = stats.norm.ppf((1 + confidence) / 2)

    denominator = 1 + z**2 / trials
    center = (p + z**2 / (2 * trials)) / denominator
    spread = z * np.sqrt((p * (1 - p) + z**2 / (4 * trials)) / trials) / denominator

    return p, center - spread, center + spread

prop, ci_low, ci_high = wilson_ci(successes=45, trials=100)
print(f"[STAT:estimate] proportion = {prop:.3f}")
print(f"[STAT:ci] 95% Wilson CI [{ci_low:.3f}, {ci_high:.3f}]")

Effect Size Calculation

Cohen's d for Group Comparisons

import numpy as np

def cohens_d(group1, group2):
    """Calculate Cohen's d effect size for two independent groups."""
    n1, n2 = len(group1), len(group2)
    var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)

    # Pooled standard deviation
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    d = (np.mean(group1) - np.mean(group2)) / pooled_std

    # Interpretation
    magnitude = "small" if abs(d) < 0.5 else "medium" if abs(d) < 0.8 else "large"
    return d, magnitude

d, magnitude = cohens_d(treatment, control)
print(f"[STAT:effect_size] Cohen's d = {d:.3f} ({magnitude})")

r² for Correlations

from scipy import stats

def correlation_r2(x, y):
    """Calculate Pearson r and r² with interpretation."""
    r, p = stats.pearsonr(x, y)
    r2 = r ** 2

    # Interpretation (based on Cohen's guidelines for r)
    magnitude = "small" if abs(r) < 0.3 else "medium" if abs(r) < 0.5 else "large"
    return r, r2, p, magnitude

r, r2, p, magnitude = correlation_r2(df[x_col], df[y_col])
print(f"[STAT:estimate] r = {r:.3f}")
print(f"[STAT:effect_size] r² = {r2:.3f} ({magnitude} effect, {r2*100:.1f}% variance explained)")
print(f"[STAT:p_value] p = {p:.4f}")

Cliff's Delta for Non-Parametric Comparisons

import numpy as np

def cliffs_delta(group1, group2):
    """Calculate Cliff's delta (non-parametric effect size)."""
    n1, n2 = len(group1), len(group2)

    # Count dominance
    more = sum(1 for x in group1 for y in group2 if x > y)
    less = sum(1 for x in group1 for y in group2 if x < y)
    delta = (more - less) / (n1 * n2)

    # Interpretation (Romano et al., 2006)
    abs_d = abs(delta)
    magnitude = "negligible" if abs_d < 0.147 else "small" if abs_d < 0.33 else "medium" if abs_d < 0.474 else "large"
    return delta, magnitude

delta, magnitude = cliffs_delta(treatment, control)
print(f"[STAT:effect_size] Cliff's delta = {delta:.3f} ({magnitude})")

Assumption Checking

Normality: Shapiro-Wilk and Q-Q Plot

from scipy import stats
import matplotlib.pyplot as plt

def check_normality(data, col_name="variable", alpha=0.05):
    """Check normality assumption with Shapiro-Wilk test and Q-Q plot."""
    # Shapiro-Wilk test (best for n < 5000)
    stat, p = stats.shapiro(data)
    is_normal = p > alpha

    print(f"[CHECK:normality] Shapiro-Wilk W={stat:.4f}, p={p:.4f}")
    print(f"[CHECK:normality] {'PASS' if is_normal else 'FAIL'}: Data {'is' if is_normal else 'is NOT'} normally distributed (α={alpha})")

    # Q-Q plot for visual inspection
    fig, ax = plt.subplots(figsize=(6, 6))
    stats.probplot(data, dist="norm", plot=ax)
    ax.set_title(f"Q-Q Plot: {col_name}")
    plt.savefig(f"reports/figures/qq_plot_{col_name}.png", dpi=150, bbox_inches="tight")
    plt.close()

    return is_normal, stat, p

is_normal, stat, p = check_normality(df[col], col_name=col)

Homogeneity of Variance: Levene's Test

from scipy import stats

def check_homogeneity(*groups, alpha=0.05):
    """Check homogeneity of variance (homoscedasticity) with Levene's test."""
    stat, p = stats.levene(*groups)
    is_homogeneous = p > alpha

    print(f"[CHECK:homogeneity] Levene's W={stat:.4f}, p={p:.4f}")
    print(f"[CHECK:homogeneity] {'PASS' if is_homogeneous else 'FAIL'}: Variances {'are' if is_homogeneous else 'are NOT'} equal (α={alpha})")

    if not is_homogeneous:
        print("[CHECK:homogeneity] Recommendation: Use Welch's t-test instead of Student's t-test")

    return is_homogeneous, stat, p

is_homogeneous, stat, p = check_homogeneity(group1, group2)

Independence: Durbin-Watson Test (for Regression Residuals)

from statsmodels.stats.stattools import durbin_watson

def check_independence(residuals):
    """Check independence of residuals with Durbin-Watson test."""
    dw_stat = durbin_watson(residuals)

    # Interpretation: DW ≈ 2 means no autocorrelation
    # DW < 1.5 suggests positive autocorrelation
    # DW > 2.5 suggests negative autocorrelation
    if dw_stat < 1.5:
        status = "FAIL - positive autocorrelation detected"
    elif dw_stat > 2.5:
        status = "FAIL - negative autocorrelation detected"
    else:
        status = "PASS - no significant autocorrelation"

    print(f"[CHECK:independence] Durbin-Watson = {dw_stat:.3f}")
    print(f"[CHECK:independence] {status}")

    return dw_stat, status

dw_stat, status = check_independence(model.resid)

Robust Alternatives

Welch's t-test (Instead of Student's t-test)

from scipy import stats

def welchs_ttest(group1, group2, alpha=0.05):
    """
    Welch's t-test - DEFAULT choice for comparing two groups.
    Does NOT assume equal variances (more robust than Student's t-test).
    """
    stat, p = stats.ttest_ind(group1, group2, equal_var=False)  # equal_var=False for Welch's

    print(f"[DECISION] Using Welch's t-test: Does not assume equal variances")
    print(f"[STAT:estimate] t-statistic = {stat:.3f}")
    print(f"[STAT:p_value] p = {p:.4f}")

    # Effect size
    from numpy import sqrt, var, mean
    n1, n2 = len(group1), len(group2)
    pooled_std = sqrt(((n1-1)*var(group1, ddof=1) + (n2-1)*var(group2, ddof=1)) / (n1+n2-2))
    d = (mean(group1) - mean(group2)) / pooled_std
    magnitude = "small" if abs(d) < 0.5 else "medium" if abs(d) < 0.8 else "large"
    print(f"[STAT:effect_size] Cohen's d = {d:.3f} ({magnitude})")

    return stat, p, d

t_stat, p_value, effect_size = welchs_ttest(treatment, control)

Mann-Whitney U Test (for Non-Normal Data)

from scipy import stats
import numpy as np

def mann_whitney_test(group1, group2, alpha=0.05):
    """
    Mann-Whitney U test - Non-parametric alternative to t-test.
    Use when normality assumption is violated.
    """
    stat, p = stats.mannwhitneyu(group1, group2, alternative='two-sided')

    print(f"[DECISION] Using Mann-Whitney U: Non-parametric, does not assume normality")
    print(f"[STAT:estimate] U-statistic = {stat:.3f}")
    print(f"[STAT:p_value] p = {p:.4f}")

    # Effect size: Cliff's delta (appropriate for non-parametric)
    n1, n2 = len(group1), len(group2)
    more = sum(1 for x in group1 for y in group2 if x > y)
    less = sum(1 for x in group1 for y in group2 if x < y)
    delta = (more - less) / (n1 * n2)
    magnitude = "negligible" if abs(delta) < 0.147 else "small" if abs(delta) < 0.33 else "medium" if abs(delta) < 0.474 else "large"
    print(f"[STAT:effect_size] Cliff's delta = {delta:.3f} ({magnitude})")

    return stat, p, delta

u_stat, p_value, effect_size = mann_whitney_test(treatment, control)

Permutation Test (for Complex Designs)

import numpy as np

def permutation_test(group1, group2, n_permutations=10000, stat_func=None):
    """
    Permutation test - Most robust, makes minimal assumptions.
    Use when parametric assumptions are violated or for complex statistics.
    """
    if stat_func is None:
        stat_func = lambda x, y: np.mean(x) - np.mean(y)

    observed = stat_func(group1, group2)
    combined = np.concatenate([group1, group2])
    n1 = len(group1)

    # Generate permutation distribution
    perm_stats = []
    for _ in range(n_permutations):
        np.random.shuffle(combined)
        perm_stat = stat_func(combined[:n1], combined[n1:])
        perm_stats.append(perm_stat)

    # Two-tailed p-value
    p_value = np.mean(np.abs(perm_stats) >= np.abs(observed))

    print(f"[DECISION] Using permutation test: Assumption-free, {n_permutations} permutations")
    print(f"[STAT:estimate] Observed difference = {observed:.4f}")
    print(f"[STAT:p_value] p = {p_value:.4f} (permutation-based)")

    # Bootstrap CI for the observed statistic
    boot_diffs = []
    for _ in range(n_permutations):
        b1 = np.random.choice(group1, size=len(group1), replace=True)
        b2 = np.random.choice(group2, size=len(group2), replace=True)
        boot_diffs.append(stat_func(b1, b2))
    ci_low, ci_high = np.percentile(boot_diffs, [2.5, 97.5])
    print(f"[STAT:ci] 95% Bootstrap CI [{ci_low:.4f}, {ci_high:.4f}]")

    return observed, p_value, ci_low, ci_high

obs, p_val, ci_low, ci_high = permutation_test(treatment, control)

Memory Management

print(f"[MEMORY] DataFrame size: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
# Clean up
del large_df
import gc; gc.collect()