## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(eval = FALSE, echo = TRUE) ## ----load-packages------------------------------------------------------------ # # Load SurveyStat package # library(SurveyStat) # # # Load required packages # library(dplyr) # library(ggplot2) # library(knitr) # # # Set seed for reproducibility # set.seed(12345) ## ----load-data, eval = FALSE-------------------------------------------------- # # Create example survey dataset (embedded for vignette building) # survey_data <- data.frame( # Age = c(25, 34, 45, 28, 52, 31, 38, 29, 47, 33, 41, 26, 36, 43, 30), # Gender = c("Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male"), # Education = c("High School", "Bachelor", "Bachelor", "High School", "Graduate", "Bachelor", "High School", "Graduate", "Bachelor", "High School", "Graduate", "Bachelor", "High School", "Bachelor", "Graduate"), # Income = c(35000, 52000, 68000, 31000, 85000, 48000, 42000, 62000, 71000, 38000, 78000, 45000, 55000, 65000, 72000), # Weight = c(0.85, 1.12, 0.95, 1.08, 0.92, 1.05, 0.98, 1.15, 0.88, 1.02, 0.91, 1.08, 0.96, 1.03, 0.89) # ) # # # Display basic information # cat("Dataset dimensions:", nrow(survey_data), "rows,", ncol(survey_data), "columns\n\n") # cat("Variable names:", paste(names(survey_data), collapse = ", "), "\n\n") # # # Display first few observations # # Note: Using knitr::kable for CRAN compatibility # knitr::kable(head(survey_data, 10), caption = "First 10 observations of the example survey dataset") ## ----remove-duplicates-------------------------------------------------------- # # Check for duplicates # n_duplicates <- nrow(survey_data) - nrow(unique(survey_data)) # cat("Number of duplicate rows:", n_duplicates, "\n") # # # Remove duplicates (if any exist) # clean_data <- remove_duplicates(survey_data) # cat("Dataset size after removing duplicates:", nrow(clean_data), "rows\n") ## ----missing-values----------------------------------------------------------- # # Check for missing values # missing_summary <- sapply(clean_data, function(x) sum(is.na(x))) # missing_table <- data.frame( # Variable = names(missing_summary), # Missing_Count = missing_summary, # Missing_Percentage = round(missing_summary / nrow(clean_data) * 100, 2) # ) # knitr::kable(missing_table, caption = "Missing value summary") # # # For demonstration, let's introduce some missing values # clean_data$Income[sample(1:nrow(clean_data), min(5, nrow(clean_data)))] <- NA # clean_data$Education[sample(1:nrow(clean_data), min(3, nrow(clean_data)))] <- NA # # # Clean missing values using mean imputation for Income # clean_data_income <- clean_missing(clean_data, "Income", method = "mean") # # # Clean missing values using mode imputation for Education # clean_data_final <- clean_missing(clean_data_income, "Education", method = "mode") # # # Verify missing values are handled # cat("Missing values after cleaning:\n") # print(sapply(clean_data_final, function(x) sum(is.na(x)))) ## ----standardize-categories--------------------------------------------------- # # Check current gender categories # cat("Current gender categories:\n") # print(table(clean_data_final$Gender)) # # # For demonstration, let's create some inconsistent categories # clean_data_final$Gender[sample(1:nrow(clean_data_final), min(8, nrow(clean_data_final)))] <- "M" # clean_data_final$Gender[sample(1:nrow(clean_data_final), min(7, nrow(clean_data_final)))] <- "F" # # # Standardize gender categories # gender_mapping <- list( # "M" = "Male", "Male" = "Male", # "F" = "Female", "Female" = "Female" # ) # # clean_data_standardized <- standardize_categories(clean_data_final, "Gender", gender_mapping) # # # Verify standardization # cat("Gender categories after standardization:\n") # print(table(clean_data_standardized$Gender)) ## ----apply-weights------------------------------------------------------------ # # Check weight distribution # cat("Weight summary:\n") # print(summary(clean_data_standardized$Weight)) # # # Apply normalized weights # weighted_data <- apply_weights(clean_data_standardized, "Weight") # # # Compare weight distributions # cat("Original weight sum:", sum(clean_data_standardized$Weight), "\n") # cat("Normalized weight sum:", sum(weighted_data$Weight), "\n") # cat("Sample size:", nrow(weighted_data), "\n") ## ----weighted-stats----------------------------------------------------------- # # Calculate unweighted and weighted means for Income # unweighted_mean_income <- mean(weighted_data$Income, na.rm = TRUE) # weighted_mean_income <- weighted_mean(weighted_data, "Income", "Weight") # # cat("Unweighted mean income:", round(unweighted_mean_income, 2), "\n") # cat("Weighted mean income:", round(weighted_mean_income, 2), "\n") # # # Calculate weighted total income # total_income <- weighted_total(weighted_data, "Income", "Weight") # cat("Weighted total income:", round(total_income, 0), "\n") ## ----raking-weights, eval = FALSE--------------------------------------------- # # Create population targets for raking # population_targets <- list( # Gender = c(Male = 1000000, Female = 1050000), # Education = c(`High School` = 800000, Bachelor = 900000, Graduate = 350000) # ) # # # Apply raking (note: this is a simplified demonstration) # weighted_data <- rake_weights(weighted_data, population_targets, "Weight") # # # Compare weight distributions before and after raking # cat("Weight statistics before raking:\n") # print(summary(weighted_data$Weight)) # # cat("Weight statistics after raking:\n") # print(summary(weighted_data$Weight)) ## ----descriptive-stats, eval = FALSE------------------------------------------ # # Generate comprehensive survey description # survey_description <- describe_survey(weighted_data, weight_col = "Weight") # # # Display sample information # cat("Sample Size:", survey_description$Sample_Size, "\n") # cat("Number of Variables:", survey_description$Number_Variables, "\n") # # # Display numeric variable statistics # if (!is.null(survey_description$Numeric_Statistics)) { # knitr::kable(survey_description$Numeric_Statistics, # caption = "Numeric variable statistics (including weighted means)") # } # # # Display categorical variable statistics for Gender # if (!is.null(survey_description$Categorical_Statistics$Gender)) { # knitr::kable(survey_description$Categorical_Statistics$Gender, # caption = "Gender distribution (unweighted and weighted)") # } ## ----frequency-tables--------------------------------------------------------- # # Generate frequency table for Education # education_freq <- frequency_table(weighted_data, "Education", "Weight") # knitr::kable(education_freq, caption = "Education frequency distribution (weighted)") # # # Generate frequency table for Gender (unweighted) # gender_freq_unweighted <- frequency_table(weighted_data, "Gender") # knitr::kable(gender_freq_unweighted, caption = "Gender frequency distribution (unweighted)") ## ----cross-tabulation--------------------------------------------------------- # # Create cross-tabulation between Gender and Education # cross_tab <- cross_tabulation(weighted_data, "Gender", "Education", "Weight") # # # Display cross-tabulation table # cat("Cross-tabulation table (unweighted):\n") # print(cross_tab$Cross_Table) # # # Display chi-square test results # cat("\nChi-square test results:\n") # cat("Statistic:", cross_tab$Chi_Square_Test$Statistic, "\n") # cat("Degrees of freedom:", cross_tab$Chi_Square_Test$DF, "\n") # cat("P-value:", cross_tab$Chi_Square_Test$P_Value, "\n") # cat("Method:", cross_tab$Chi_Square_Test$Method, "\n") # # # Display weighted cross-tabulation # if (cross_tab$Weighted) { # cat("\nWeighted cross-tabulation table:\n") # print(cross_tab$Weighted_Cross_Table) # } ## ----histogram-age------------------------------------------------------------ # # Create histogram for Age # age_histogram <- plot_histogram(weighted_data, "Age", bins = 25, add_density = TRUE) # print(age_histogram) ## ----bar-education------------------------------------------------------------ # # Create weighted bar plot for Education # education_bar <- plot_weighted_bar(weighted_data, "Education", "Weight", show_percentages = TRUE) # print(education_bar) ## ----boxplot-income----------------------------------------------------------- # # Create box plot of Income by Gender # income_boxplot <- plot_boxplot(weighted_data, "Income", "Gender", add_points = TRUE) # print(income_boxplot) ## ----boxplot-age-------------------------------------------------------------- # # Create box plot for Age only # age_boxplot <- plot_boxplot(weighted_data, "Age", add_points = TRUE) # print(age_boxplot) ## ----table1------------------------------------------------------------------- # # Create publication-ready table # table1_data <- data.frame( # Variable = c("Age", "Income", "Gender (Male)", "Education (College+)"), # Unweighted = c( # paste0(round(mean(weighted_data$Age, na.rm = TRUE), 1), " (", # round(sd(weighted_data$Age, na.rm = TRUE), 1), ")"), # paste0("$", round(mean(weighted_data$Income, na.rm = TRUE), 0)), # paste0(sum(weighted_data$Gender == "Male"), " (", # round(mean(weighted_data$Gender == "Male") * 100, 1), "%)"), # paste0(sum(weighted_data$Education %in% c("Bachelor", "Graduate")), " (", # round(mean(weighted_data$Education %in% c("Bachelor", "Graduate")) * 100, 1), "%)") # ), # Weighted = c( # paste0(round(weighted_mean(weighted_data, "Age", "Weight"), 1), " (", # "SD not calculated)"), # paste0("$", round(weighted_mean(weighted_data, "Income", "Weight"), 0)), # paste0(round(survey_description$Categorical_Statistics$Gender$Weighted_Percentage[ # survey_description$Categorical_Statistics$Gender$Category == "Male"], 1), "%"), # paste0(round( # sum(survey_description$Categorical_Statistics$Education$Weighted_Percentage[ # survey_description$Categorical_Statistics$Education$Category %in% c("Bachelor", "Graduate")]), 1), "%") # ) # ) # # knitr::kable(table1_data, caption = "Table 1: Sample Characteristics (Unweighted vs Weighted)") ## ----table2------------------------------------------------------------------- # # Create cross-tabulation results table # table2_data <- data.frame( # Category = c("Chi-square statistic", "Degrees of freedom", "P-value"), # Value = c( # round(cross_tab$Chi_Square_Test$Statistic, 3), # cross_tab$Chi_Square_Test$DF, # format.pval(cross_tab$Chi_Square_Test$P_Value, digits = 3) # ) # ) # # knitr::kable(table2_data, caption = "Table 2: Chi-square Test of Independence (Gender vs Education)") ## ----reproducibility---------------------------------------------------------- # # Display session information for reproducibility # session_info <- sessionInfo() # cat("R version:", session_info$R.version$version, "\n") # cat("Platform:", session_info$R.version$platform, "\n") # cat("Packages loaded:\n") # print(names(session_info$loadedOnly)) ## ----final-check-------------------------------------------------------------- # # Final data quality check # cat("Final dataset quality:\n") # cat("Rows:", nrow(weighted_data), "\n") # cat("Columns:", ncol(weighted_data), "\n") # cat("Missing values:", sum(is.na(weighted_data)), "\n") # cat("Weight sum:", sum(weighted_data$Weight, na.rm = TRUE), "\n") # cat("Analysis completed successfully!\n")