## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----------------------------------------------------------------------------- library(regressinator) linear_pop <- population( x1 = predictor(rnorm, mean = 4, sd = 10), x2 = predictor(runif, min = 0, max = 10), y = response( 0.7 + 2.2 * x1 - 0.2 * x2, # relationship between X and Y family = gaussian(), # link function and response distribution error_scale = 1.5 # errors are scaled by this amount ) ) ## ----------------------------------------------------------------------------- logistic_pop <- population( x1 = predictor(rnorm, mean = 0, sd = 10), x2 = predictor(runif, min = 0, max = 10), y = response(0.7 + 2.2 * x1 - 0.2 * x2, family = binomial(link = "logit")) ) ## ----------------------------------------------------------------------------- heteroskedastic_pop <- population( x1 = predictor(rnorm, mean = 5, sd = 4), x2 = predictor(runif, min = 0, max = 10), y = response( 4 + 2 * x1 - 3 * x2, # relationship between X and Y family = ols_with_error(rnorm), # distribution of the errors error_scale = 0.5 + x2 / 10 # errors depend on x2 ) ) ## ----------------------------------------------------------------------------- heavy_tail_pop <- population( x1 = predictor(rnorm, mean = 5, sd = 4), x2 = predictor(runif, min = 0, max = 10), y = response( 4 + 2 * x1 - 3 * x2, # relationship between X and Y family = ols_with_error(rt, df = 3), # distribution of the errors error_scale = 1.0 # errors are multiplied by this scale factor ) ) ## ----------------------------------------------------------------------------- # 40% of draws have lambda = 0, the rest have lambda given by the inverse link zeroinfpois <- function(ys) { n <- length(ys) rpois(n, lambda = ys * rbinom(n, 1, prob = 0.4)) } pop <- population( x1 = predictor(rnorm, mean = 2, sd = 2), y = response( 0.7 + 0.8 * x1, family = custom_family(zeroinfpois, exp) ) ) ## ----------------------------------------------------------------------------- # default is equally likely levels rfactor(5, c("foo", "bar", "baz")) # but probabilities per level can be specified rfactor(5, c("foo", "bar", "baz"), c(0.4, 0.3, 0.3)) ## ----------------------------------------------------------------------------- intercepts <- c("foo" = 2, "bar" = 30, "baz" = 7) factor_intercept_pop <- population( group = predictor(rfactor, levels = c("foo", "bar", "baz"), prob = c(0.1, 0.6, 0.3)), x = predictor(runif, min = 0, max = 10), y = response(by_level(group, intercepts) + 0.3 * x, error_scale = 1.5) ) ## ----------------------------------------------------------------------------- slopes <- c("foo" = 2, "bar" = 30, "baz" = 7) factor_slope_pop <- population( group = predictor(rfactor, levels = c("foo", "bar", "baz"), prob = c(0.1, 0.6, 0.3)), x = predictor(runif, min = 0, max = 10), y = response(7 + by_level(group, slopes) * x, error_scale = 1.5) ) ## ----------------------------------------------------------------------------- linear_samp <- sample_y(sample_x(linear_pop, n = 10)) ## ----------------------------------------------------------------------------- logistic_pop |> sample_x(n = 10) |> sample_y() ## ----------------------------------------------------------------------------- fit <- lm(y ~ x1 + x2, data = linear_samp) ## ----------------------------------------------------------------------------- fit <- lm(linear_samp$y ~ linear_samp$x1 + linear_samp$x2) ## ----fig.width=4, fig.height=4------------------------------------------------ library(broom) nonlinear_pop <- population( x1 = predictor(runif, min = 1, max = 8), x2 = predictor(runif, min = 0, max = 10), y = response(0.7 + x1**2 - x2, family = gaussian(), error_scale = 4.0) ) nonlinear_data <- nonlinear_pop |> sample_x(n = 100) |> sample_y() nonlinear_fit <- lm(y ~ x1 + x2, data = nonlinear_data) # to see the kind of information we get from an augmented fit augment(nonlinear_fit) library(ggplot2) ggplot(augment(nonlinear_fit), aes(x = .fitted, y = .resid)) + geom_point() + labs(x = "Fitted value", y = "Residual") ## ----fig.height=4------------------------------------------------------------- ggplot(augment_longer(nonlinear_fit), aes(x = .predictor_value, y = .resid)) + geom_point() + facet_wrap(vars(.predictor_name), scales = "free") + labs(x = "Predictor value", y = "Residual") ## ----fig.height=4------------------------------------------------------------- ggplot(partial_residuals(nonlinear_fit), aes(x = .predictor_value, y = .partial_resid)) + geom_point() + # partial residuals geom_smooth(se = FALSE) + # smoothed residuals geom_line(aes(x = .predictor_value, y = .predictor_effect)) + # effects facet_wrap(vars(.predictor_name), scales = "free") + labs(x = "Predictor value", y = "Partial residual") ## ----fig.height=4------------------------------------------------------------- quadratic_fit <- lm(y ~ poly(x1, 2) + x2, data = nonlinear_data) ggplot(partial_residuals(quadratic_fit), aes(x = .predictor_value, y = .partial_resid)) + geom_point() + # partial residuals geom_smooth(se = FALSE) + # smoothed residuals geom_line(aes(x = .predictor_value, y = .predictor_effect)) + # effects facet_wrap(vars(.predictor_name), scales = "free") + labs(x = "Predictor value", y = "Partial residual") ## ----fig.width=6, fig.height=6------------------------------------------------ model_lineup(nonlinear_fit) |> ggplot(aes(x = .fitted, y = .resid)) + geom_point() + facet_wrap(vars(.sample)) + labs(x = "Fitted value", y = "Residual") ## ----fig.width=6, fig.height=6------------------------------------------------ heavy_tail_pop <- population( x1 = predictor(rnorm, mean = 5, sd = 4), x2 = predictor(runif, min = 0, max = 10), y = response( 4 + 2 * x1 - 3 * x2, family = ols_with_error(rt, df = 3), error_scale = 1.0 ) ) heavy_tail_sample <- heavy_tail_pop |> sample_x(n = 100) |> sample_y() fit <- lm(y ~ x1 + x2, data = heavy_tail_sample) model_lineup(fit) |> ggplot(aes(sample = .resid)) + geom_qq() + geom_qq_line() + facet_wrap(vars(.sample)) + labs(x = "Theoretical quantiles", y = "Observed quantiles") ## ----------------------------------------------------------------------------- d <- linear_pop |> sample_x(n = 30) |> sample_y() fit <- lm(y ~ x1 + x2, data = d) tidy(fit) sampling_distribution(fit, d, nsim = 4) ## ----fig.height=4------------------------------------------------------------- samples <- sampling_distribution(fit, d, nsim = 1000) samples |> ggplot(aes(x = estimate)) + geom_histogram() + facet_wrap(vars(term), scales = "free") + labs(x = "Estimate", y = "Frequency") ## ----message=FALSE------------------------------------------------------------ library(dplyr) samples |> group_by(term) |> summarize(mean = mean(estimate), sd = sd(estimate)) ## ----------------------------------------------------------------------------- quadratic_fit <- lm(y ~ x1 + I(x1^2) + x2, data = nonlinear_data) anova(nonlinear_fit, quadratic_fit) ## ----fig.height=4------------------------------------------------------------- quadratic_coefs <- parametric_boot_distribution(nonlinear_fit, quadratic_fit, data = nonlinear_data) |> filter(term == "I(x1^2)") ggplot(quadratic_coefs, aes(x = estimate)) + geom_histogram() + geom_vline(xintercept = coef(quadratic_fit)["I(x1^2)"], color = "red") + labs(x = "Quadratic term coefficient", y = "Count", title = "Null distribution of quadratic term")