## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) ## ----packages----------------------------------------------------------------- library(miceFast) library(dplyr) library(ggplot2) set.seed(2025) ## ----quickstart--------------------------------------------------------------- data(air_miss) # 1. Impute m = 10 completed datasets completed <- lapply(1:10, function(i) { air_miss %>% mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))) }) # 2. Fit the analysis model on each fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) # 3. Pool using Rubin's rules summary(pool(fits)) ## ----mi-basic----------------------------------------------------------------- data(air_miss) # Step 1: Create m = 10 completed datasets m <- 10 completed <- lapply(1:m, function(i) { air_miss %>% mutate( Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") ) ) }) # Step 2: Fit the analysis model on each fits <- lapply(completed, function(d) { lm(Ozone_imp ~ Wind + Temp, data = d) }) # Step 3: Pool using Rubin's rules pool_result <- pool(fits) pool_result # Detailed summary with confidence intervals summary(pool_result) ## ----mi-mixed----------------------------------------------------------------- data(air_miss) impute_dataset <- function(data) { data %>% mutate( # Continuous: Bayesian linear regression Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") ), Solar_R_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept") ), # Categorical: LDA with random ridge for stochasticity Ozone_chac_imp = fill_NA( x = ., model = "lda", posit_y = "Ozone_chac", posit_x = c("Wind", "Temp"), ridge = runif(1, 0.5, 50) ) ) } set.seed(777) completed <- replicate(10, impute_dataset(air_miss), simplify = FALSE) # Fit and pool a model for continuous outcome fits_ozone <- lapply(completed, function(d) { lm(Ozone_imp ~ Wind + Temp + Solar_R_imp, data = d) }) pool(fits_ozone) ## ----mi-glm------------------------------------------------------------------- data(air_miss) # Create a binary outcome for demonstration completed <- lapply(1:10, function(i) { d <- air_miss %>% mutate( Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") ) ) d$high_ozone <- as.integer(d$Ozone_imp > median(d$Ozone_imp, na.rm = TRUE)) d }) fits_logit <- lapply(completed, function(d) { glm(high_ozone ~ Wind + Temp, data = d, family = binomial) }) pool(fits_logit) ## ----mi-grouped--------------------------------------------------------------- data(air_miss) completed_grouped <- lapply(1:5, function(i) { air_miss %>% group_by(groups) %>% do(mutate(., Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Wind", "Temp", "Intercept") ) )) %>% ungroup() }) fits <- lapply(completed_grouped, function(d) { lm(Ozone_imp ~ Wind + Temp + factor(groups), data = d) }) pool(fits) ## ----mi-weighted-------------------------------------------------------------- data(air_miss) completed_w <- lapply(1:5, function(i) { air_miss %>% mutate( Solar_R_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept"), w = weights ) ) }) fits_w <- lapply(completed_w, function(d) { lm(Solar_R_imp ~ Wind + Temp, data = d) }) pool(fits_w) ## ----mi-pmm-oop--------------------------------------------------------------- data(air_miss) dat <- as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]) dat <- cbind(dat, Intercept = 1) m <- 10 completed <- lapply(1:m, function(i) { model <- new(miceFast) model$set_data(dat + 0) # copy. set_data uses the matrix by reference. # impute("pmm", ...) draws from the Bayesian posterior and matches to nearest observed value model$update_var(1, model$impute("pmm", 1, c(3, 4, 5))$imputations) d <- as.data.frame(model$get_data()) colnames(d) <- c("Ozone", "Solar.R", "Wind", "Temp", "Intercept") d }) fits_pmm <- lapply(completed, function(d) { lm(Ozone ~ Wind + Temp, data = d) }) pool(fits_pmm) ## ----sensitivity-models------------------------------------------------------- data(air_miss) air_sensitivity <- air_miss %>% mutate( Ozone_bayes = fill_NA(x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")), Ozone_noise = fill_NA(x = ., model = "lm_noise", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")), Ozone_pmm = fill_NA_N(x = ., model = "pmm", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20), Ozone_pred = fill_NA(x = ., model = "lm_pred", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")) ) compare_imp(air_sensitivity, origin = "Ozone", target = c("Ozone_bayes", "Ozone_noise", "Ozone_pmm", "Ozone_pred")) ## ----sensitivity-m------------------------------------------------------------ set.seed(2025) results <- data.frame() for (m_val in c(3, 5, 10, 20, 50)) { completed <- lapply(1:m_val, function(i) { air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )) }) fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) s <- summary(pool(fits)) s$m <- m_val results <- rbind(results, s) } results %>% filter(term == "Wind") %>% ggplot(aes(x = m, y = estimate)) + geom_point(size = 3) + geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) + labs(title = "Stability of Wind coefficient across m", x = "Number of imputations (m)", y = "Pooled estimate") + theme_minimal() ## ----sensitivity-baselines---------------------------------------------------- data(air_miss) set.seed(2025) # 1. Complete cases (listwise deletion). Unbiased under MCAR. fit_cc <- lm(Ozone ~ Wind + Temp, data = air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]) ci_cc <- confint(fit_cc) # 2. Mean imputation. Biased; underestimates variance. air_mean <- air_miss air_mean$Ozone[is.na(air_mean$Ozone)] <- mean(air_mean$Ozone, na.rm = TRUE) fit_mean <- lm(Ozone ~ Wind + Temp, data = air_mean) ci_mean <- confint(fit_mean) # 3. Deterministic regression imputation (lm_pred). No residual noise. air_pred <- air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_pred", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )) fit_pred <- lm(Ozone_imp ~ Wind + Temp, data = air_pred) ci_pred <- confint(fit_pred) # 4. Proper MI with Rubin's rules (lm_bayes, m = 20) completed <- lapply(1:20, function(i) { air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )) }) fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) pool_s <- summary(pool(fits)) # Compare Wind coefficient across all methods comparison <- data.frame( method = c("Complete cases", "Mean imputation", "Regression (lm_pred)", "MI (lm_bayes, m=20)"), estimate = c(coef(fit_cc)["Wind"], coef(fit_mean)["Wind"], coef(fit_pred)["Wind"], pool_s$estimate[pool_s$term == "Wind"]), ci_low = c(ci_cc["Wind", 1], ci_mean["Wind", 1], ci_pred["Wind", 1], pool_s$conf.low[pool_s$term == "Wind"]), ci_high = c(ci_cc["Wind", 2], ci_mean["Wind", 2], ci_pred["Wind", 2], pool_s$conf.high[pool_s$term == "Wind"]), n_used = c(nrow(air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]), nrow(air_miss), nrow(air_miss), nrow(air_miss)) ) comparison$ci_width <- comparison$ci_high - comparison$ci_low comparison ## ----vonhippel---------------------------------------------------------------- # Pilot with m = 5 set.seed(2025) pilot <- lapply(1:5, function(i) { air_miss %>% mutate(Ozone_imp = fill_NA(x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"))) }) pilot_pool <- pool(lapply(pilot, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))) # Inspect FMI, the key input for deciding m data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3)) # For the exact formula and its derivation see: # von Hippel (2020) "How many imputations do you need?", Sociological Methods & Research # R implementation: install.packages("howManyImputations")