Core workflow patterns for building machine learning models using the tidymodels ecosystem. Covers the complete pipeline from data splitting through model deployment.
/plugin marketplace add choxos/BiostatAgent
/plugin install choxos-r-tidy-modeling-plugins-r-tidy-modeling@choxos/BiostatAgent
This skill inherits all available tools. When active, it can use any tool Claude has access to.
library(tidymodels)
# Basic train/test split
set.seed(123)
data_split <- initial_split(data, prop = 0.75, strata = outcome)
train_data <- training(data_split)
test_data <- testing(data_split)
# Validation set approach
data_split <- initial_validation_split(data, prop = c(0.6, 0.2))
train_data <- training(data_split)
val_data <- validation(data_split)
test_data <- testing(data_split)
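To use the validation portion with the tuning functions rather than as a plain data frame, the three-way split can be wrapped as a single resample. A minimal sketch, assuming rsample >= 1.2.0 and the data_split object above:
# Convert the three-way split into a one-resample rset usable with tune_grid()/fit_resamples()
val_set <- validation_set(data_split)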
# Create preprocessing recipe
recipe_spec <- recipe(outcome ~ ., data = train_data) |>
  step_normalize(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())
# Specify model with tune placeholders
model_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) |>
  set_engine("ranger") |>
  set_mode("classification")
# Combine recipe and model
workflow_spec <- workflow() |>
  add_recipe(recipe_spec) |>
  add_model(model_spec)
# Cross-validation folds
cv_folds <- vfold_cv(train_data, v = 10, strata = outcome)
# Bootstrap samples
boot_samples <- bootstraps(train_data, times = 25)
# Define tuning grid (named rf_grid so it does not mask the tune_grid() function)
rf_grid <- grid_regular(
  mtry(range = c(2, 10)),
  min_n(range = c(2, 20)),
  levels = 5
)
# Tune model
tune_results <- workflow_spec |>
  tune_grid(
    resamples = cv_folds,
    grid = rf_grid,
    metrics = metric_set(roc_auc, accuracy)
  )
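Before committing to a parameter set, the tuning results can be inspected with the standard tune helpers. A minimal sketch, assuming the tune_results object above:
# Inspect tuning results before selecting parameters
collect_metrics(tune_results)                       # metrics for every candidate
show_best(tune_results, metric = "roc_auc", n = 5)  # top 5 candidates by ROC AUC
autoplot(tune_results)                              # visualize metric profiles across the grid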
# Select best parameters
best_params <- select_best(tune_results, metric = "roc_auc")
# Finalize workflow
final_workflow <- workflow_spec |>
  finalize_workflow(best_params)
# Fit on full training data, evaluate on test
final_fit <- final_workflow |>
  last_fit(data_split)
# Extract metrics
collect_metrics(final_fit)
# Extract predictions
collect_predictions(final_fit)
# Extract fitted workflow
fitted_wf <- extract_workflow(final_fit)
# Save model
saveRDS(fitted_wf, "output/models/final_model.rds")
# Predict on new data
predictions <- predict(fitted_wf, new_data)
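For later scoring, the saved workflow can be reloaded and applied to fresh data. A minimal sketch, assuming new_data contains the same predictor columns used during training:
# Reload the fitted workflow and score new observations
loaded_wf <- readRDS("output/models/final_model.rds")
class_preds <- predict(loaded_wf, new_data)                 # predicted classes
prob_preds  <- predict(loaded_wf, new_data, type = "prob")  # class probabilities
scored      <- augment(loaded_wf, new_data)                 # predictions bound to the input data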
library(tidymodels)
tidymodels_prefer()
# 1. Load and split data
set.seed(123)
data_split <- initial_split(ames, prop = 0.75, strata = Sale_Price)
# 2. Create recipe
ames_recipe <- recipe(Sale_Price ~ ., data = training(data_split)) |>
  step_log(Sale_Price, base = 10) |>
  step_other(Neighborhood, threshold = 0.05) |>
  step_dummy(all_nominal_predictors()) |>
  step_normalize(all_numeric_predictors()) |>
  step_zv(all_predictors())
# 3. Specify model
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  learn_rate = tune()
) |>
  set_engine("xgboost") |>
  set_mode("regression")
# 4. Create workflow
xgb_wf <- workflow(ames_recipe, xgb_spec)
# 5. Setup resampling
cv_folds <- vfold_cv(training(data_split), v = 5)
# 6. Tune hyperparameters
xgb_tune <- xgb_wf |>
  tune_grid(
    resamples = cv_folds,
    grid = 20,
    metrics = metric_set(rmse, rsq)
  )
# 7. Select best and finalize
best_xgb <- select_best(xgb_tune, metric = "rmse")
final_wf <- finalize_workflow(xgb_wf, best_xgb)
# 8. Final evaluation
final_fit <- last_fit(final_wf, data_split)
collect_metrics(final_fit)
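Because the recipe log10-transforms Sale_Price, the metrics and predictions from this fit are on the log scale. A minimal sketch of back-transforming the test-set predictions, assuming the final_fit object above:
# Predictions are on the log10 scale because of step_log() in the recipe
final_fit |>
  collect_predictions() |>
  mutate(.pred_dollars = 10^.pred)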
# Create multiple preprocessing recipes
basic_recipe <- recipe(outcome ~ ., data = train) |>
  step_normalize(all_numeric_predictors())
pca_recipe <- basic_recipe |>
  step_pca(all_numeric_predictors(), num_comp = 5)
# Create multiple model specifications
lm_spec <- linear_reg() |> set_engine("lm")
rf_spec <- rand_forest(trees = 500) |> set_engine("ranger") |> set_mode("regression")
xgb_spec <- boost_tree() |> set_engine("xgboost") |> set_mode("regression")
# Create workflow set
wf_set <- workflow_set(
  preproc = list(basic = basic_recipe, pca = pca_recipe),
  models = list(lm = lm_spec, rf = rf_spec, xgb = xgb_spec)
)
# Fit all workflows
wf_results <- wf_set |>
  workflow_map(
    resamples = cv_folds,
    grid = 10,
    verbose = TRUE
  )
# Compare results
autoplot(wf_results)
rank_results(wf_results, rank_metric = "rmse")
Use last_fit() for final evaluation on the test set; the sketch below shows how to carry the top-ranked workflow from the set through that step.
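A minimal sketch of finalizing the best workflow from the set, assuming the wf_results object above and an initial data_split created as in the earlier examples:
# Pull the top-ranked workflow id, finalize it, and evaluate on the test set
best_id <- rank_results(wf_results, rank_metric = "rmse") |>
  slice(1) |>
  pull(wflow_id)
best_tune_res <- extract_workflow_set_result(wf_results, id = best_id)
best_params <- select_best(best_tune_res, metric = "rmse")
best_final_wf <- extract_workflow(wf_results, id = best_id) |>
  finalize_workflow(best_params)
best_final_fit <- last_fit(best_final_wf, data_split)
collect_metrics(best_final_fit)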