## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) ## ----setup, message=FALSE, warning=FALSE-------------------------------------- library(ci) library(dplyr) ## ----eval=FALSE--------------------------------------------------------------- # # Binomial CI: two categories (e.g., mutant/wild-type) # ci_binom(x = 54, n = 80) # # # Multinomial CI: 3+ categories (e.g., blood types) # ci_multinom(c("A" = 20, "B" = 35, "AB" = 25, "O" = 15)) # # # Mean CI based on data: average with groups # data |> # group_by(group_var) |> # ci_mean_t(numeric_var) # # # Mean CI based on summary statistics (mean, SD, n): e.g., literature values # ci_mean_t_stat(mean_ = 75, sd_ = 10, n = 25) # # # Any statistic (median, correlation, etc.) # data |> ci_boot(var1, var2, FUN = cor, R = 1000) ## ----------------------------------------------------------------------------- # 54 out of 80 Drosophila are mutation-free ci_binom(x = 54, n = 80) ## ----paged.print=FALSE-------------------------------------------------------- ci_n1 <- ci_binom(x = 54, n = 80) # Small sample ci_n2 <- ci_binom(x = 135, n = 200) # Larger sample ci_n3 <- ci_binom(x = 675, n = 1000) # Even larger sample ## ----------------------------------------------------------------------------- bind_rows(ci_n1, ci_n2, ci_n3) |> mutate(diff_upr_lwr = round(upr.ci - lwr.ci, digits = 3)) ## ----------------------------------------------------------------------------- ci_q1 <- ci_binom(x = 54, n = 80, conf.level = 0.90) # Less confident, narrower ci_q2 <- ci_binom(x = 54, n = 80, conf.level = 0.95) # Standard ci_q3 <- ci_binom(x = 54, n = 80, conf.level = 0.99) # More confident, wider ## ----------------------------------------------------------------------------- bind_rows(ci_q1, ci_q2, ci_q3) |> mutate(diff_upr_lwr = round(upr.ci - lwr.ci, digits = 3)) ## ----------------------------------------------------------------------------- # Blood type distribution blood_types <- c("A" = 20, "B" = 35, "AB" = 25, "O" = 15) ci_multinom(blood_types) ## ----------------------------------------------------------------------------- # Transportation preferences c("Car" = 20, "Bus" = 45, "Bike" = 15, "Walk" = 20) |> ci_multinom(method = "goodman") ## ----------------------------------------------------------------------------- # Using built-in dataset as example data(npk, package = "datasets") head(npk, n = 3) ## ----------------------------------------------------------------------------- # Average crop yield (overall) npk |> ci_mean_t(yield) ## ----------------------------------------------------------------------------- # Compare yields with and without nitrogen fertilizer npk |> group_by(N) |> ci_mean_t(yield) ## ----------------------------------------------------------------------------- # Multiple grouping factors npk |> group_by(N, P, K) |> ci_mean_t(yield) ## ----------------------------------------------------------------------------- data(iris, package = "datasets") head(iris, n = 3) ## ----------------------------------------------------------------------------- # Petal length by flower species iris |> group_by(Species) |> ci_mean_t(Petal.Length) ## ----------------------------------------------------------------------------- # Calculate CI from reported statistics ci_mean_t_stat(mean_ = 75, sd_ = 10, n = 25) ## ----------------------------------------------------------------------------- # Three enzyme activity measurements mean_val <- c(78, 82, 75) std_dev <- c(8, 7, 9) n <- c(30, 28, 32) group <- c("Enzyme A", "Enzyme B", "Enzyme C") ci_mean_t_stat(mean_val, std_dev, n, group) ## ----------------------------------------------------------------------------- # Three different treatment conditions from literature treatments <- tibble( treatment = c("Control", "Low dose", "High dose"), mean_response = c(78, 82, 75), sd = c(8, 7, 9), n = c(30, 28, 32) ) treatments |> with(ci_mean_t_stat(mean_response, sd, n, treatment)) ## ----------------------------------------------------------------------------- # Same mean and SD but four different sample sizes ci_mean_t_stat(mean_ = 75, sd_ = 10, n = c(10, 25, 50, 100)) ## ----------------------------------------------------------------------------- set.seed(123) # For reproducible results # Median petal length for all flowers iris |> ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "bca") ## ----------------------------------------------------------------------------- set.seed(456) # Median by species iris |> group_by(Species) |> ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "bca") ## ----------------------------------------------------------------------------- set.seed(789) # How variable is petal length? iris |> ci_boot(Petal.Length, FUN = sd, R = 1000, bci.method = "bca") ## ----------------------------------------------------------------------------- set.seed(101) # Relationship between petal length and width for each species iris |> group_by(Species) |> ci_boot( Petal.Length, Petal.Width, FUN = cor, method = "pearson", R = 1000, bci.method = "bca" ) ## ----------------------------------------------------------------------------- set.seed(101) # Correlation between petal length and width iris |> group_by(Species) |> ci_boot( Petal.Length, Petal.Width, FUN = cor, method = "spearman", R = 1000, bci.method = "bca" ) ## ----------------------------------------------------------------------------- set.seed(222) # Percentile method (simpler, more robust) iris |> ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "perc") # BCa method (often more accurate) iris |> ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "bca") ## ----------------------------------------------------------------------------- # Same data, different confidence levels ci_mean_t_stat(mean_ = 75, sd_ = 10, n = 25, conf.level = 0.90) ci_mean_t_stat(mean_ = 75, sd_ = 10, n = 25, conf.level = 0.95) ci_mean_t_stat(mean_ = 75, sd_ = 10, n = 25, conf.level = 0.99) ## ----------------------------------------------------------------------------- # Effect of sample size on precision ci_mean_t_stat(mean_ = 85, sd_ = 15, n = c(10, 25, 50, 100, 400)) ## ----------------------------------------------------------------------------- # Two treatment methods ci_mean_t_stat( mean_ = c(78, 82), sd_ = c(8, 7), n = c(30, 28), group = c("Treatment A", "Treatment B") ) ## ----------------------------------------------------------------------------- # Function to run bootstrap CI and measure computation time simulate_ci <- function(R) { # system.time() tracks how long the code takes to run time_s <- system.time( result <- iris |> ci_boot(Petal.Length, FUN = mean, R = R, bci.method = "bca") ) # Add useful metrics to the results: # - diff: CI width (difference between upper and lower bounds) # - R: number of resamples used # - time_s: elapsed computation time in seconds result <- result |> mutate( diff = (upr.ci - lwr.ci) |> round(digits = 3), upr.ci = upr.ci |> round(digits = 3), lwr.ci = lwr.ci |> round(digits = 3), time_s = time_s[3], # [3] extracts elapsed time (wall-clock time), R = R, ) return(result) } ## ----------------------------------------------------------------------------- # Run simulation with different numbers of bootstrap resamples # Each R value is tested 5 times to assess variability set.seed(307) results <- bind_rows( # 500 resamples simulate_ci(R = 500), simulate_ci(R = 500), simulate_ci(R = 500), simulate_ci(R = 500), simulate_ci(R = 500), # 1000 resamples simulate_ci(R = 1000), simulate_ci(R = 1000), simulate_ci(R = 1000), simulate_ci(R = 1000), simulate_ci(R = 1000), # 3000 resamples simulate_ci(R = 3000), simulate_ci(R = 3000), simulate_ci(R = 3000), simulate_ci(R = 3000), simulate_ci(R = 3000), # 5000 resamples simulate_ci(R = 5000), simulate_ci(R = 5000), simulate_ci(R = 5000), simulate_ci(R = 5000), simulate_ci(R = 5000), # 9999 resamples simulate_ci(R = 9999), simulate_ci(R = 9999), simulate_ci(R = 9999), simulate_ci(R = 9999), simulate_ci(R = 9999), ) print(results, n = Inf) ## ----------------------------------------------------------------------------- # Summary by number of replications (R) summary_of_results <- results |> group_by(R) |> summarize( diff_mean = mean(diff) |> round(digits = 3), diff_sd = sd(diff) |> round(digits = 3), time_mean = mean(time_s) |> round(digits = 2), time_sd = sd(time_s) |> round(digits = 2) ) summary_of_results