From aj-geddes-useful-ai-prompts-4
Builds binary and multiclass classification models using logistic regression, decision trees, Random Forest, and Gradient Boosting with scikit-learn for tasks like churn prediction and fraud detection.
npx claudepluginhub joshuarweaver/cascade-code-languages-misc-1 --plugin aj-geddes-useful-ai-prompts-4This skill uses the workspace's default tool permissions.
Classification modeling predicts categorical target values, assigning observations to discrete classes or categories based on input features.
Searches, retrieves, and installs Agent Skills from prompts.chat registry using MCP tools like search_skills and get_skill. Activates for finding skills, browsing catalogs, or extending Claude.
Searches prompts.chat for AI prompt templates by keyword or category, retrieves by ID with variable handling, and improves prompts via AI. Use for discovering or enhancing prompts.
Checks Next.js compilation errors using a running Turbopack dev server after code edits. Fixes actionable issues before reporting complete. Replaces `next build`.
Classification modeling predicts categorical target values, assigning observations to discrete classes or categories based on input features.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
confusion_matrix, classification_report, roc_auc_score, roc_curve,
precision_recall_curve, f1_score, accuracy_score
)
import seaborn as sns
# Generate sample binary classification data
np.random.seed(42)
from sklearn.datasets import make_classification
X, y = make_classification(
n_samples=1000, n_features=20, n_informative=10,
n_redundant=5, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
y_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]
print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba_lr):.4f}\n")
# Decision Tree
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
y_proba_dt = dt_model.predict_proba(X_test)[:, 1]
print("Decision Tree:")
print(classification_report(y_test, y_pred_dt))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba_dt):.4f}\n")
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba_rf):.4f}\n")
# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]
print("Gradient Boosting:")
print(classification_report(y_test, y_pred_gb))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba_gb):.4f}\n")
# Confusion matrices
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
models = [
(y_pred_lr, 'Logistic Regression'),
(y_pred_dt, 'Decision Tree'),
(y_pred_rf, 'Random Forest'),
(y_pred_gb, 'Gradient Boosting'),
]
for idx, (y_pred, title) in enumerate(models):
cm = confusion_matrix(y_test, y_pred)
ax = axes[idx // 2, idx % 2]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(title)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.tight_layout()
plt.show()
# ROC Curves
plt.figure(figsize=(10, 8))
probas = [
(y_proba_lr, 'Logistic Regression'),
(y_proba_dt, 'Decision Tree'),
(y_proba_rf, 'Random Forest'),
(y_proba_gb, 'Gradient Boosting'),
]
for y_proba, label in probas:
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
plt.plot(fpr, tpr, label=f'{label} (AUC={auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Precision-Recall Curves
plt.figure(figsize=(10, 8))
for y_proba, label in probas:
precision, recall, _ = precision_recall_curve(y_test, y_proba)
f1 = f1_score(y_test, (y_proba > 0.5).astype(int))
plt.plot(recall, precision, label=f'{label} (F1={f1:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Feature importance
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Tree-based feature importance
feature_importance_rf = pd.Series(
rf_model.feature_importances_, index=range(X.shape[1])
).sort_values(ascending=False)
axes[0].barh(range(10), feature_importance_rf.values[:10])
axes[0].set_yticks(range(10))
axes[0].set_yticklabels([f'Feature {i}' for i in feature_importance_rf.index[:10]])
axes[0].set_title('Random Forest - Top 10 Features')
axes[0].set_xlabel('Importance')
# Logistic regression coefficients
lr_coef = pd.Series(lr_model.coef_[0], index=range(X.shape[1])).abs().sort_values(ascending=False)
axes[1].barh(range(10), lr_coef.values[:10])
axes[1].set_yticks(range(10))
axes[1].set_yticklabels([f'Feature {i}' for i in lr_coef.index[:10]])
axes[1].set_title('Logistic Regression - Top 10 Features (abs coef)')
axes[1].set_xlabel('Absolute Coefficient')
plt.tight_layout()
plt.show()
# Model comparison
results = pd.DataFrame({
'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting'],
'Accuracy': [
accuracy_score(y_test, y_pred_lr),
accuracy_score(y_test, y_pred_dt),
accuracy_score(y_test, y_pred_rf),
accuracy_score(y_test, y_pred_gb),
],
'AUC-ROC': [
roc_auc_score(y_test, y_proba_lr),
roc_auc_score(y_test, y_proba_dt),
roc_auc_score(y_test, y_proba_rf),
roc_auc_score(y_test, y_proba_gb),
],
'F1-Score': [
f1_score(y_test, y_pred_lr),
f1_score(y_test, y_pred_dt),
f1_score(y_test, y_pred_rf),
f1_score(y_test, y_pred_gb),
]
})
print("Model Comparison:")
print(results)
# Cross-validation
cv_scores = cross_val_score(
RandomForestClassifier(n_estimators=100, random_state=42),
X_train, y_train, cv=5, scoring='roc_auc'
)
print(f"\nCross-validation AUC scores: {cv_scores}")
print(f"Mean CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
# Probability calibration
from sklearn.calibration import calibration_curve
prob_true, prob_pred = calibration_curve(y_test, y_proba_rf, n_bins=10)
plt.figure(figsize=(8, 6))
plt.plot(prob_pred, prob_true, 'o-', label='Random Forest')
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.title('Calibration Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()