## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup--------------------------------------------------------------------
library(leakr)

## ----basic_example------------------------------------------------------------
# Load the iris dataset
data(iris)

# Run a basic audit
report <- leakr_audit(iris, target = "Species")

# View the summary
print(report)

## ----examine_report-----------------------------------------------------------
# Get a detailed summary
summary_report <- leakr_summarise(report, top_n = 5, show_config = TRUE)
print(summary_report)

## ----train_test_example-------------------------------------------------------
# Create a dataset with potential train/test leakage
set.seed(123)
n <- 1000

# Simulate a dataset
data <- data.frame(
  feature1 = rnorm(n),
  feature2 = rnorm(n),
  feature3 = rnorm(n),
  target = factor(sample(c("A", "B"), n, replace = TRUE))
)

# Create a train/test split (70% of the rows are labelled "train").
# seq_len() stays correct for n == 0, and round() hands sample() an exact
# whole-number size instead of relying on silent truncation of 0.7 * n.
train_indices <- sample(seq_len(n), size = round(0.7 * n))
split_vector <- rep("test", n)
split_vector[train_indices] <- "train"

# Run audit with split information
report_with_split <- leakr_audit(
  data = data,
  target = "target",
  split = split_vector
)

print(report_with_split)

## ----target_leakage-----------------------------------------------------------
# Create data with obvious target leakage
leaky_data <- data.frame(
  legitimate_feature = rnorm(100),
  target = factor(sample(c("yes", "no"), 100, replace = TRUE)),
  stringsAsFactors = FALSE
)

# Add a leaky feature (perfectly correlated with target)
leaky_data$leaky_feature <- ifelse(leaky_data$target == "yes", 1, 0)

# Audit for target leakage
leakage_report <- leakr_audit(leaky_data, target = "target")
print(leakage_report)

## ----duplication--------------------------------------------------------------
# Create data with duplicates: the first 5 rows are appended a second time
original_data <- mtcars[1:20, ]
duplicated_data <- rbind(original_data, original_data[1:5, ])

# Add row identifiers
# seq_len(nrow(.)) yields an empty sequence for a zero-row frame, whereas
# 1:nrow(.) would yield c(1, 0).
duplicated_data$id <- seq_len(nrow(duplicated_data))

# Run duplication audit
dup_report <- leakr_audit(
  data = duplicated_data,
  target = "mpg",
  id = "id"
)

print(dup_report)

## ----configuration------------------------------------------------------------
# Custom configuration
custom_config <- list(
  sample_size = 10000,          # Limit analysis to 10k rows for large datasets
  correlation_threshold = 0.9,  # Adjust sensitivity for correlation-based detectors
  duplicate_threshold = 0.95    # Threshold for near-duplicate detection
)

# Run audit with custom configuration
configured_report <- leakr_audit(
  data = iris,
  target = "Species",
  config = custom_config
)

print(configured_report)

## ----visualisation, eval=FALSE------------------------------------------------
# # Generate diagnostic plots
# plots <- generate_diagnostic_plots(report)
#
# # Display plots (if available)
# if (!is.null(plots)) {
#   plot(plots)
# }

## ----large_dataset_simulation-------------------------------------------------
# Simulate a large dataset
set.seed(42)
large_n <- 50000

large_data <- data.frame(
  feature1 = rnorm(large_n),
  feature2 = rnorm(large_n),
  feature3 = sample(letters[1:5], large_n, replace = TRUE),
  target = factor(sample(c("positive", "negative"), large_n, replace = TRUE))
)

# leakr will automatically sample this dataset
large_report <- leakr_audit(large_data, target = "target")
print(large_report)