Comprehensive model evaluation using yardstick and related packages. Covers metrics for classification, regression, and survival outcomes, plus calibration and uncertainty quantification.
/plugin marketplace add choxos/BiostatAgent
/plugin install choxos-r-tidy-modeling-plugins-r-tidy-modeling@choxos/BiostatAgent

This skill inherits all available tools. When active, it can use any tool Claude has access to.
library(yardstick)
# Hard predictions (class)
predictions |>
accuracy(truth = outcome, estimate = .pred_class)
predictions |>
sens(truth = outcome, estimate = .pred_class) # sensitivity/recall
predictions |>
spec(truth = outcome, estimate = .pred_class) # specificity
predictions |>
ppv(truth = outcome, estimate = .pred_class) # precision
predictions |>
npv(truth = outcome, estimate = .pred_class)
predictions |>
f_meas(truth = outcome, estimate = .pred_class) # F1 score
predictions |>
kap(truth = outcome, estimate = .pred_class) # Cohen's kappa
predictions |>
mcc(truth = outcome, estimate = .pred_class) # Matthews correlation
# ROC AUC
predictions |>
roc_auc(truth = outcome, .pred_positive_class)
# PR AUC (better for imbalanced data)
predictions |>
pr_auc(truth = outcome, .pred_positive_class)
# Brier score
predictions |>
brier_class(truth = outcome, .pred_positive_class)
# Log loss
predictions |>
mn_log_loss(truth = outcome, .pred_positive_class)
# Gain capture (lift)
predictions |>
gain_capture(truth = outcome, .pred_positive_class)
# Multi-class accuracy (computed directly; no estimator argument needed)
predictions |>
accuracy(truth = outcome, estimate = .pred_class)
# Macro-averaged (average the metric across classes)
predictions |>
f_meas(truth = outcome, estimate = .pred_class, estimator = "macro")
# Micro-averaged (pool then calculate)
predictions |>
f_meas(truth = outcome, estimate = .pred_class, estimator = "micro")
# Weighted by class prevalence
predictions |>
f_meas(truth = outcome, estimate = .pred_class, estimator = "macro_weighted")
# Multi-class ROC AUC (Hand-Till generalization by default; one-vs-all via estimator = "macro" or "macro_weighted")
predictions |>
roc_auc(truth = outcome, .pred_class1:.pred_classN)
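A concrete sketch, assuming three outcome classes named A, B, and C so the probability columns are .pred_A, .pred_B, .pred_C:
predictions |>
roc_auc(truth = outcome, .pred_A, .pred_B, .pred_C) # Hand-Till generalization (default)
predictions |>
roc_auc(truth = outcome, .pred_A, .pred_B, .pred_C, estimator = "macro_weighted") # one-vs-all, weighted by class prevalence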
# Create metric set for consistent evaluation
class_metrics <- metric_set(
accuracy,
sens,
spec,
ppv,
f_meas,
roc_auc
)
# Use in tuning
tune_results <- workflow |>
tune_grid(
resamples = cv_folds,
metrics = class_metrics
)
# Use on predictions
predictions |>
class_metrics(truth = outcome, estimate = .pred_class, .pred_positive)
# RMSE (penalizes large errors)
predictions |>
rmse(truth = outcome, estimate = .pred)
# MAE (robust to outliers)
predictions |>
mae(truth = outcome, estimate = .pred)
# R-squared
predictions |>
rsq(truth = outcome, estimate = .pred)
# R-squared traditional (can be negative)
predictions |>
rsq_trad(truth = outcome, estimate = .pred)
# Mean absolute percentage error
predictions |>
mape(truth = outcome, estimate = .pred)
# Symmetric MAPE
predictions |>
smape(truth = outcome, estimate = .pred)
# Huber loss (robust to outliers)
predictions |>
huber_loss(truth = outcome, estimate = .pred)
# Concordance correlation coefficient
predictions |>
ccc(truth = outcome, estimate = .pred)
# Index of ideality of correlation
predictions |>
iic(truth = outcome, estimate = .pred)
reg_metrics <- metric_set(
rmse,
mae,
rsq,
mape
)
predictions |>
reg_metrics(truth = outcome, estimate = .pred)
# Generate ROC curve data
roc_data <- predictions |>
roc_curve(truth = outcome, .pred_positive)
# Plot
autoplot(roc_data)
# Multiple models
all_predictions |>
group_by(model) |>
roc_curve(truth = outcome, .pred_positive) |>
autoplot()
pr_data <- predictions |>
pr_curve(truth = outcome, .pred_positive)
autoplot(pr_data)
# Gain curve
gain_data <- predictions |>
gain_curve(truth = outcome, .pred_positive)
autoplot(gain_data)
# Lift curve
lift_data <- predictions |>
lift_curve(truth = outcome, .pred_positive)
autoplot(lift_data)
# Calibration plots (cal_plot_* come from the probably package and return ggplot objects directly)
library(probably)
predictions |>
cal_plot_breaks(truth = outcome, estimate = .pred_positive, num_breaks = 10)
# Windowed (moving-window) calibration
predictions |>
cal_plot_windowed(truth = outcome, estimate = .pred_positive)
# Generate confusion matrix (avoid naming the object conf_mat, which shadows the function)
cm <- predictions |>
conf_mat(truth = outcome, estimate = .pred_class)
# Visualize
autoplot(cm, type = "heatmap")
autoplot(cm, type = "mosaic")
# Extract a tibble of metrics derived from the confusion matrix
summary(cm)
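summary() returns one row per derived metric, so it can be filtered to just the ones of interest:
summary(cm) |>
filter(.metric %in% c("sens", "spec", "ppv", "npv"))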
library(probably)
# Logistic calibration (Platt scaling)
cal_obj <- predictions |>
cal_estimate_logistic(truth = outcome, .pred_positive)
calibrated <- predictions |>
cal_apply(cal_obj)
# Isotonic regression
cal_obj <- predictions |>
cal_estimate_isotonic(truth = outcome, .pred_positive)
# Beta calibration
cal_obj <- predictions |>
cal_estimate_beta(truth = outcome, .pred_positive)
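Whichever estimator is used, re-check the calibration plot on the adjusted predictions (re-using cal_plot_breaks from above):
predictions |>
cal_apply(cal_obj) |>
cal_plot_breaks(truth = outcome, estimate = .pred_positive, num_breaks = 10)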
# Add calibration to a workflow: conceptual only; add_calibration() is not a real
# tidymodels function, and calibration is currently applied post hoc (see the sketch below)
calibrated_wf <- workflow() |>
add_model(model_spec) |>
add_recipe(recipe) |>
add_calibration()
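A workable pattern today, sketched under the assumption that cv_results holds resampled predictions (fit with save_pred = TRUE) and test_predictions holds predictions on new data, is to estimate the calibrator on out-of-sample predictions and apply it at scoring time:
# Out-of-sample predictions from resampling (hypothetical cv_results object)
oos_preds <- collect_predictions(cv_results)
cal_obj <- oos_preds |>
cal_estimate_logistic(truth = outcome, .pred_positive)
# Apply the calibrator to predictions on new data (hypothetical test_predictions)
calibrated_test <- test_predictions |>
cal_apply(cal_obj)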
library(probably)
# Optimize for J-index (sens + spec - 1)
thresh_results <- predictions |>
threshold_perf(
truth = outcome,
.pred_positive,
thresholds = seq(0.1, 0.9, by = 0.05),
metrics = metric_set(j_index, sens, spec)
)
# Find the optimal threshold
best_threshold <- thresh_results |>
filter(.metric == "j_index") |>
slice_max(.estimate)
# Apply the chosen threshold (rather than a hard-coded value)
predictions |>
mutate(
.pred_class = make_two_class_pred(
.pred_positive,
levels(outcome),
threshold = best_threshold$.threshold
)
)
# With different misclassification costs (a false negative costs 5x a false positive);
# classification_cost() takes a data frame of costs with truth/estimate/cost columns, not a matrix.
# The level names "positive"/"negative" are assumed here.
cost_tbl <- tribble(
~truth, ~estimate, ~cost,
"positive", "negative", 5,
"negative", "positive", 1
)
predictions |>
classification_cost(
truth = outcome,
.pred_positive,
costs = cost_tbl
)
# Bootstrap metric estimates (bootstraps() is from rsample; map()/unnest() from purrr/tidyr)
boot_metrics <- bootstraps(predictions, times = 1000) |>
mutate(
metrics = map(splits, ~ {
analysis(.x) |>
accuracy(truth = outcome, estimate = .pred_class)
})
) |>
unnest(metrics)
# Calculate CI
quantile(boot_metrics$.estimate, c(0.025, 0.975))
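The same resampling pattern extends to a whole metric set; a sketch re-using class_metrics from above, grouping by metric before taking percentile intervals:
boot_set <- bootstraps(predictions, times = 1000) |>
mutate(
metrics = map(splits, ~ {
analysis(.x) |>
class_metrics(truth = outcome, estimate = .pred_class, .pred_positive)
})
) |>
unnest(metrics) |>
group_by(.metric) |>
summarise(
lower = quantile(.estimate, 0.025),
upper = quantile(.estimate, 0.975)
)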
library(probably)
# Conformal prediction intervals for regression (CV+ method) via int_conformal_cv();
# the resampling results must be created with save_pred = TRUE and extract = I
ctrl <- control_resamples(save_pred = TRUE, extract = I)
cv_results <- fit_resamples(workflow, resamples = cv_folds, control = ctrl)
conf_obj <- int_conformal_cv(cv_results)
# Predict with 95% intervals (.pred_lower / .pred_upper columns)
predict(conf_obj, new_data, level = 0.95)
# Collect metrics from multiple workflows
wf_results <- workflow_set |>
workflow_map(resamples = cv_folds)
# Compare
autoplot(wf_results)
rank_results(wf_results, rank_metric = "roc_auc")
# Statistical comparison
# (informally via confidence intervals)
collect_metrics(wf_results) |>
filter(.metric == "roc_auc") |>
ggplot(aes(x = wflow_id, y = mean, ymin = mean - std_err, ymax = mean + std_err)) +
geom_pointrange()
# Resample-level comparison
library(tidyposterior)
# Bayesian ANOVA-like comparison of resampled performance
perf_model <- perf_mod(wf_results, metric = "roc_auc")
# Contrasts between two workflows (use their wflow_id values)
contrast_models(perf_model, list_1 = "model_A", list_2 = "model_B")
# Collect training CV metrics
train_metrics <- collect_metrics(tune_results)
# Get test metrics
test_metrics <- final_fit |>
collect_metrics()
# Compare for overfitting
bind_rows(
train_metrics |> mutate(set = "CV"),
test_metrics |> mutate(set = "Test")
)
# Performance by subgroup
predictions |>
group_by(subgroup) |>
metrics(truth = outcome, estimate = .pred_class, .pred_positive)
| Problem | Primary Metric | Secondary Metrics |
|---|---|---|
| Binary balanced | ROC AUC | Accuracy, F1 |
| Binary imbalanced | PR AUC, F1 | Sens, PPV |
| Multi-class | Macro F1 | Accuracy, Kappa |
| Regression | RMSE | MAE, R² |
| Regression with outliers | MAE, Huber | RMSE |
| Rare events | PR AUC | Sens, PPV |
| Medical diagnosis | Sens, Spec | NPV, PPV |
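Putting the table into practice, a metric set for an imbalanced binary problem could look like:
imbalanced_metrics <- metric_set(pr_auc, f_meas, sens, ppv)
predictions |>
imbalanced_metrics(truth = outcome, estimate = .pred_class, .pred_positive)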
This skill should be used when the user asks about evaluating models: classification, regression, or survival metrics, ROC/PR curves, calibration, threshold selection, bootstrap confidence intervals, or model comparison with yardstick, probably, tune, or tidyposterior.