Comprehensive patterns for feature engineering using the recipes package. Covers preprocessing steps for numeric, categorical, and text data while preventing information leakage.
library(recipes)
# Initialize recipe with formula
rec <- recipe(outcome ~ ., data = training_data)
# Or with explicit roles
rec <- recipe(training_data) |>
update_role(outcome, new_role = "outcome") |>
update_role(id_column, new_role = "ID") |>
update_role(-outcome, -id_column, new_role = "predictor")
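To verify role assignments, recipes provides a summary method for unprepped recipes:
summary(rec)  # one row per variable: name, type, role, source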
# Type-based selectors
all_predictors()
all_outcomes()
all_numeric_predictors()
all_nominal_predictors()
all_numeric()
all_nominal()
# Name-based selectors
starts_with("prefix_")
ends_with("_suffix")
contains("pattern")
matches("regex")
any_of(c("var1", "var2"))  # one_of() is superseded by any_of()/all_of()
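A brief sketch of selectors in context (the income_ prefix and column names are illustrative):
rec <- recipe(outcome ~ ., data = train) |>
  step_log(starts_with("income_"), offset = 1) |>
  step_normalize(all_numeric_predictors())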
# Scaling options (chained here only for reference -- in practice pick one,
# since normalizing and then re-scaling the same columns is redundant):
rec <- recipe(outcome ~ ., data = train) |>
  # Center and scale (z-score)
  step_normalize(all_numeric_predictors()) |>
  # Scale to [0, 1]
  step_range(all_numeric_predictors(), min = 0, max = 1) |>
  # Center only
  step_center(all_numeric_predictors()) |>
  # Scale only
  step_scale(all_numeric_predictors())
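A minimal sketch, using mtcars as a stand-in dataset, showing that prep() estimates the scaling statistics from the training data once and reuses them on any new data:
norm_rec <- recipe(mpg ~ ., data = mtcars) |>
  step_normalize(all_numeric_predictors()) |>
  prep()
tidy(norm_rec, number = 1)  # the stored per-column means and SDs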
# Skewness transformations (choose per variable; positive_vars, skewed_vars,
# and count_vars are placeholders for your own column selections):
rec <- recipe(outcome ~ ., data = train) |>
  # Yeo-Johnson (handles zero and negative values)
  step_YeoJohnson(all_numeric_predictors()) |>
  # Box-Cox (strictly positive values only)
  step_BoxCox(positive_vars) |>
  # Log transformation (use offset = 1 for columns containing zeros)
  step_log(skewed_vars, base = 10) |>
  # Square root (often used for counts)
  step_sqrt(count_vars)
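The fitted transformation parameters can be inspected after prep(); a sketch with mtcars:
yj_rec <- recipe(mpg ~ ., data = mtcars) |>
  step_YeoJohnson(all_numeric_predictors()) |>
  prep()
tidy(yj_rec, number = 1)  # the estimated lambda for each column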
# Nonlinear basis expansions. These are alternatives: each step replaces
# continuous_var with new basis columns, so apply only one per variable.
rec <- recipe(outcome ~ ., data = train) |>
  # Natural splines
  step_ns(continuous_var, deg_free = 5)
# Alternatives:
# step_bs(continuous_var, deg_free = 5, degree = 3)  # B-splines
# step_poly(continuous_var, degree = 3)              # orthogonal polynomials
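A small sketch (mtcars) of the basis columns a spline step generates; the original column is replaced by deg_free new ones:
spline_rec <- recipe(mpg ~ disp, data = mtcars) |>
  step_ns(disp, deg_free = 4) |>
  prep()
names(bake(spline_rec, new_data = NULL))  # disp replaced by disp_ns_1 ... disp_ns_4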
# Default dummy coding: C - 1 indicator columns per factor
# (the first level becomes the reference and is dropped)
rec <- recipe(outcome ~ ., data = train) |>
  step_dummy(all_nominal_predictors())
# True one-hot encoding: keep all C levels
rec <- recipe(outcome ~ ., data = train) |>
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
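A minimal illustration with iris: Species (three levels) becomes two indicator columns, with the first level as the reference:
dummy_rec <- recipe(Sepal.Length ~ Species, data = iris) |>
  step_dummy(all_nominal_predictors()) |>
  prep()
names(bake(dummy_rec, new_data = NULL))  # Species_versicolor, Species_virginica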
# Rare, novel, and missing levels (place these before step_dummy)
rec <- recipe(outcome ~ ., data = train) |>
  # Pool infrequent levels (under 5% of training rows) into "other"
  step_other(categorical_var, threshold = 0.05, other = "other") |>
  # Add a placeholder level for levels unseen at training time
  step_novel(all_nominal_predictors()) |>
  # Convert NA to an explicit "unknown" level
  step_unknown(all_nominal_predictors())
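A quick check of the pooling behavior on a toy factor: levels below the frequency threshold collapse into "other":
toy <- data.frame(x = factor(c(rep("a", 8), "b", "c")))
other_rec <- recipe(~ x, data = toy) |>
  step_other(x, threshold = 0.15) |>
  prep()
levels(bake(other_rec, new_data = NULL)$x)  # "a" "other"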
library(embed)
# Outcome-based encodings for high-cardinality factors. Because these use the
# outcome to estimate the encoding, fit them inside resampling to avoid leakage.
rec <- recipe(outcome ~ ., data = train) |>
  # Likelihood (effect) encoding via a GLM
  step_lencode_glm(high_cardinality_var, outcome = vars(outcome)) |>
  # Mixed-model encoding with partial pooling (for hierarchical data)
  step_lencode_mixed(category, outcome = vars(outcome)) |>
  # Weight of evidence (binary outcomes only)
  step_woe(categorical_var, outcome = vars(binary_outcome))
# Imputation options (alternatives -- chained here only for reference; once
# the first applicable step runs there are no missing values left to impute):
rec <- recipe(outcome ~ ., data = train) |>
  # Simple imputation
  step_impute_mean(all_numeric_predictors()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # KNN imputation
  step_impute_knn(all_predictors(), neighbors = 5) |>
  # Bagged tree imputation
  step_impute_bag(all_predictors()) |>
  # Linear model imputation
  step_impute_linear(numeric_var, impute_with = imp_vars(predictor1, predictor2))
rec <- recipe(outcome ~ ., data = train) |>
# Create indicator for missingness
step_indicate_na(all_predictors()) |>
# Then impute
step_impute_median(all_numeric_predictors()) |>
step_impute_mode(all_nominal_predictors())
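A runnable sketch of the indicator-then-impute pattern on a toy frame:
toy_na <- data.frame(x = c(1, NA, 3), y = factor(c("a", NA, "b")))
na_rec <- recipe(~ ., data = toy_na) |>
  step_indicate_na(all_predictors()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  prep()
bake(na_rec, new_data = NULL)  # adds na_ind_x / na_ind_y alongside imputed values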
rec <- recipe(outcome ~ ., data = train) |>
  # Normalize first: PCA is sensitive to scale
  step_normalize(all_numeric_predictors()) |>
  # Principal Component Analysis with a fixed number of components
  step_pca(all_numeric_predictors(), num_comp = 5)
# Alternative: keep enough components to explain 95% of the variance
# step_pca(all_numeric_predictors(), threshold = 0.95)
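The variance captured by each component can be checked after prep(); a sketch with mtcars:
pca_rec <- recipe(mpg ~ ., data = mtcars) |>
  step_normalize(all_numeric_predictors()) |>
  step_pca(all_numeric_predictors(), num_comp = 5) |>
  prep()
tidy(pca_rec, number = 2, type = "variance")  # variance explained per component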
library(embed)  # provides step_umap (requires uwot)
rec <- recipe(outcome ~ ., data = train) |>
  # UMAP embedding
  step_umap(all_numeric_predictors(), num_comp = 2)
# Alternative: kernel PCA (step_kpca is in recipes and requires kernlab)
# step_kpca(all_numeric_predictors(), num_comp = 5)
rec <- recipe(outcome ~ ., data = train) |>
  # Interaction terms (operands must be numeric; dummy-code factors first)
  step_interact(terms = ~ var1:var2) |>
  # Crossed interactions via selectors
  step_interact(terms = ~ starts_with("x"):starts_with("z")) |>
  # Ratios: numerators go in ..., the denominator via denom = denom_vars()
  step_ratio(var1, denom = denom_vars(var2))
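Interaction columns are named with an "_x_" separator; a sketch with mtcars:
int_rec <- recipe(mpg ~ disp + wt, data = mtcars) |>
  step_interact(terms = ~ disp:wt) |>
  prep()
names(bake(int_rec, new_data = NULL))  # includes the new disp_x_wt column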
rec <- recipe(outcome ~ ., data = train) |>
# Remove zero variance
step_zv(all_predictors()) |>
# Remove near-zero variance
step_nzv(all_predictors(), freq_cut = 95/5, unique_cut = 10) |>
# Remove highly correlated
step_corr(all_numeric_predictors(), threshold = 0.9) |>
# Remove linear combinations
step_lincomb(all_numeric_predictors())
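Which columns a filter removed is recorded in the prepped step; a sketch with mtcars, where several predictors are highly correlated:
corr_rec <- recipe(mpg ~ ., data = mtcars) |>
  step_corr(all_numeric_predictors(), threshold = 0.9) |>
  prep()
tidy(corr_rec, number = 1)  # the columns dropped for high correlation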
library(themis)
# Class-imbalance steps (alternatives -- use one). They default to skip = TRUE,
# so rebalancing happens during training but is never applied to new data.
rec <- recipe(outcome ~ ., data = train) |>
  # Downsample majority class
  step_downsample(outcome)
# Alternatives:
# step_upsample(outcome)  # upsample minority class
# step_smote(outcome)     # SMOTE synthetic minority oversampling
# step_adasyn(outcome)    # ADASYN adaptive synthetic oversampling
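A runnable sketch on a toy imbalanced frame, showing that only the training data is rebalanced:
imb <- data.frame(outcome = factor(c(rep("yes", 90), rep("no", 10))), x = rnorm(100))
bal_rec <- recipe(outcome ~ x, data = imb) |>
  step_downsample(outcome) |>
  prep()
table(bake(bal_rec, new_data = NULL)$outcome)  # 10 per class; new data passes through untouched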
rec <- recipe(outcome ~ ., data = train) |>
# Extract date components
step_date(date_var, features = c("year", "month", "dow", "doy")) |>
# Holiday indicators (holiday names come from timeDate::listHolidays())
step_holiday(date_var, holidays = c("USChristmasDay", "USNewYearsDay")) |>
# Time components
step_time(datetime_var, features = c("hour", "minute"))
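A small sketch of the generated date features on a toy frame (the original column is kept by default):
dts <- data.frame(date_var = as.Date(c("2024-01-01", "2024-07-04")))
date_rec <- recipe(~ date_var, data = dts) |>
  step_date(date_var, features = c("year", "month", "dow")) |>
  prep()
bake(date_rec, new_data = NULL)  # date_var plus date_var_year, date_var_month, date_var_dow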
library(textrecipes)
rec <- recipe(outcome ~ ., data = train) |>
  step_tokenize(text_var) |>
  step_stopwords(text_var) |>
  step_stem(text_var) |>
  step_ngram(text_var, num_tokens = 2) |>
  # Limit the vocabulary before weighting (step_tfidf has no max_tokens argument)
  step_tokenfilter(text_var, max_tokens = 100) |>
  step_tfidf(text_var)
# Prepare recipe (estimate all parameters from the training data only)
prepped_rec <- prep(rec, training = train)
# Apply to training data
train_processed <- bake(prepped_rec, new_data = NULL) # juice(prepped_rec) is superseded
# Apply to new data using the training-set estimates (prevents leakage)
test_processed <- bake(prepped_rec, new_data = test_data)
# Inspect recipe
tidy(prepped_rec)
tidy(prepped_rec, number = 1) # details of a specific step
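In practice, the prep()/bake() cycle is usually handled automatically by putting the recipe in a workflow; a sketch assuming a parsnip model spec:
library(workflows)
library(parsnip)
wf <- workflow() |>
  add_recipe(rec) |>
  add_model(linear_reg())
wf_fit <- fit(wf, data = train)        # preps the recipe internally
predict(wf_fit, new_data = test_data)  # bakes new data automatically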
recipe(outcome ~ ., data = train) |>
  # 1. Handle roles and IDs
  update_role(id, new_role = "ID") |>
  # 2. Impute missing values first
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  # 3. Handle individual variable issues
  step_other(all_nominal_predictors(), threshold = 0.05) |>
  step_novel(all_nominal_predictors()) |>
  # 4. Transform numeric variables
  step_YeoJohnson(all_numeric_predictors()) |>
  # 5. Encode categorical variables
  step_dummy(all_nominal_predictors()) |>
  # 6. Create interactions (after dummy coding, so all terms are numeric)
  step_interact(terms = ~ var1:var2) |>
  # 7. Normalize (after dummy coding)
  step_normalize(all_numeric_predictors()) |>
  # 8. Feature filters last
  step_zv(all_predictors()) |>
  step_corr(all_numeric_predictors(), threshold = 0.9)
This skill should be used when the user asks about feature engineering or preprocessing in R with the recipes package. Activates for questions about recipe steps, selectors, prep/bake workflows, categorical encoding, imputation, dimensionality reduction, class imbalance, or text preprocessing within tidymodels.