From r-tidy-modeling
Comprehensive guide to resampling methods for model validation using the rsample package. Covers cross-validation, bootstrapping, and specialized resampling for time series and grouped data.
npx claudepluginhub choxos/biostatagent --plugin r-tidy-modelingThis skill uses the workspace's default tool permissions.
Comprehensive guide to resampling methods for model validation using the rsample package. Covers cross-validation, bootstrapping, and specialized resampling for time series and grouped data.
Creates new Angular apps using Angular CLI with flags for routing, SSR, SCSS, prefixes, and AI config. Follows best practices for modern TypeScript/Angular development. Use when starting Angular projects.
Generates Angular code and provides architectural guidance for projects, components, services, reactivity with signals, forms, dependency injection, routing, SSR, ARIA accessibility, animations, Tailwind styling, testing, and CLI tooling.
Provides UI/UX resources: 50+ styles, color palettes, font pairings, guidelines, charts for web/mobile across React, Next.js, Vue, Svelte, Tailwind, React Native, Flutter. Aids planning, building, reviewing interfaces.
Comprehensive guide to resampling methods for model validation using the rsample package. Covers cross-validation, bootstrapping, and specialized resampling for time series and grouped data.
library(rsample)
set.seed(123)
# Simple split (75% training)
split <- initial_split(data, prop = 0.75)
# Stratified split (maintain outcome proportions)
split <- initial_split(data, prop = 0.75, strata = outcome)
# Stratified with breaking for continuous outcomes
split <- initial_split(data, prop = 0.75, strata = outcome, breaks = 4)
# Extract sets
train <- training(split)
test <- testing(split)
# Single validation set
split <- initial_validation_split(data, prop = c(0.6, 0.2))
train <- training(split)
val <- validation(split)
test <- testing(split)
# Create validation set from resampling
val_set <- validation_set(split)
# Basic 10-fold CV
folds <- vfold_cv(train_data, v = 10)
# Stratified CV
folds <- vfold_cv(train_data, v = 10, strata = outcome)
# Repeated CV
folds <- vfold_cv(train_data, v = 10, repeats = 5, strata = outcome)
# Access individual folds
folds$splits[[1]]
analysis(folds$splits[[1]]) # training fold
assessment(folds$splits[[1]]) # validation fold
# LOO CV (useful for small datasets)
loo_folds <- loo_cv(train_data)
# Random repeated holdout
mc_folds <- mc_cv(train_data, prop = 0.75, times = 25)
# Standard bootstrap (with replacement)
boot <- bootstraps(train_data, times = 100)
# Stratified bootstrap
boot <- bootstraps(train_data, times = 100, strata = outcome)
# Apparent resampling (training = assessment for baseline)
apparent <- apparent(train_data)
# Bootstrap model coefficients
boot_models <- boot |>
mutate(
model = map(splits, ~ lm(y ~ x, data = analysis(.x))),
coefs = map(model, tidy)
) |>
unnest(coefs)
# Calculate confidence intervals
boot_ci <- int_pctl(boot_models, statistic = estimate)
# Keep groups together (e.g., patients, clusters)
group_folds <- group_vfold_cv(
data,
group = patient_id,
v = 10
)
# Outer resamples for model assessment
outer_folds <- vfold_cv(train_data, v = 5)
# Inner resamples for tuning (created during tune_grid)
tune_results <- workflow |>
tune_grid(
resamples = outer_folds, # outer loop
grid = 20
)
# Time series cross-validation
ts_folds <- rolling_origin(
ts_data,
initial = 365, # initial training window
assess = 30, # assessment window size
skip = 29, # skip between resamples
cumulative = TRUE # growing training window
)
# Fixed-size training window
ts_folds <- rolling_origin(
ts_data,
initial = 365,
assess = 30,
cumulative = FALSE # sliding window
)
# Alternative sliding window approach
ts_folds <- sliding_window(
ts_data,
lookback = 365, # training window size
assess_start = 1,
assess_stop = 30
)
# Sliding period (for date/time index)
ts_folds <- sliding_period(
ts_data,
index = date,
period = "month",
lookback = 12,
assess_stop = 1
)
library(spatialsample)
# Spatial block CV
spatial_folds <- spatial_block_cv(
sf_data,
v = 10,
buffer = 1000 # buffer distance in map units
)
# Spatial clustering CV
spatial_folds <- spatial_clustering_cv(
sf_data,
v = 10,
cluster_function = "kmeans"
)
# Leave-location-out CV
spatial_folds <- spatial_leave_location_out_cv(
sf_data,
group = location_id
)
# Using workflows
results <- workflow |>
fit_resamples(
resamples = folds,
metrics = metric_set(rmse, rsq, mae),
control = control_resamples(save_pred = TRUE)
)
# Collect metrics
collect_metrics(results)
# Collect predictions
collect_predictions(results)
# Create custom resample object
custom_splits <- list(
make_splits(
list(analysis = 1:100, assessment = 101:130),
data = my_data
),
make_splits(
list(analysis = 1:120, assessment = 121:150),
data = my_data
)
)
custom_rset <- manual_rset(
splits = custom_splits,
ids = c("Split1", "Split2")
)
| Method | Use Case | Variance | Bias |
|---|---|---|---|
| V-fold CV | General purpose | Medium | Low |
| Repeated CV | More stable estimates | Low | Low |
| LOOCV | Small datasets | High | Very Low |
| Bootstrap | Confidence intervals | Low | Medium |
| Monte Carlo CV | Large datasets | Medium | Low |
| Rolling Origin | Time series | - | Low |
| Group CV | Clustered data | Medium | Low |
# Classification with imbalanced classes
folds <- vfold_cv(data, v = 10, strata = outcome)
# Small dataset (n < 100)
folds <- vfold_cv(data, v = 5, repeats = 10, strata = outcome)
# or
folds <- loo_cv(data)
# Large dataset (n > 10000)
folds <- vfold_cv(data, v = 10) # single fold sufficient
# Time series
folds <- rolling_origin(data, initial = n_train, assess = n_test)
# Grouped/clustered data
folds <- group_vfold_cv(data, group = cluster_id, v = 10)
# Always stratify for:
# - Imbalanced classification (rare events)
# - Small datasets
# - Multi-class with varying frequencies
# Stratify continuous outcomes with breaks
folds <- vfold_cv(data, strata = continuous_outcome, breaks = 4)
# Tune with cross-validation
tune_results <- workflow |>
tune_grid(
resamples = vfold_cv(train, v = 10, strata = outcome),
grid = 20,
metrics = metric_set(roc_auc, accuracy)
)
# Tune with bootstrap
tune_results <- workflow |>
tune_bayes(
resamples = bootstraps(train, times = 25),
iter = 50
)