Data leakage is one of the most insidious problems in machine learning, where information from the future or target variable inadvertently influences model training. The leakr package provides a comprehensive toolkit for detecting common leakage patterns that can compromise model validity and reproducibility.
This vignette demonstrates the basic functionality of leakr using standard datasets, showing how to identify and diagnose potential leakage issues in your machine learning workflows.
The primary interface for leakage detection is leakr_audit(), which runs multiple detectors on your dataset and generates a comprehensive report.
# Load the built-in iris dataset (150 rows: 4 numeric features plus the
# Species factor used as the prediction target)
data(iris)
# Run a basic audit: leakr_audit() applies its default leakage detectors
# to the data, treating "Species" as the target column
report <- leakr_audit(iris, target = "Species")
# View the summary -- printing the report shows its $summary, $evidence
# and $meta components
print(report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 150 5
#>
#> $meta$original_data_shape
#> [1] 150 5
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:41 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"

The audit report contains several key components:
# Get a detailed summary: leakr_summarise() pretty-prints the audit report,
# limiting the listing to the top_n most severe issues and echoing the
# configuration that was used for the run
summary_report <- leakr_summarise(report, top_n = 5, show_config = TRUE)
#> Leakage Audit Report
#> ===================
#> Data shape: 150 x 5
#> Detectors run:
#> Timestamp: 2025-10-22 10:43:41
#>
#> ✓ No leakage issues detected.
# The returned summary is itself a data frame (empty here: no issues found)
print(summary_report)
#> data frame with 0 columns and 0 rows

One of the most common sources of leakage occurs when information from the test set influences training. Let’s create a more realistic example:
# Create a dataset with potential train/test leakage
set.seed(123)
n <- 1000
# Simulate a dataset of three independent numeric features and a
# binary factor target
data <- data.frame(
  feature1 = rnorm(n),
  feature2 = rnorm(n),
  feature3 = rnorm(n),
  target = factor(sample(c("A", "B"), n, replace = TRUE))
)
# Create a 70/30 train/test split. seq_len(n) is the safe form of 1:n
# (1:n would yield c(1, 0) if n were ever 0).
train_indices <- sample(seq_len(n), 0.7 * n)
split_vector <- rep("test", n)
split_vector[train_indices] <- "train"
# Run audit with split information so leakr can check for leakage
# across the train/test boundary
report_with_split <- leakr_audit(
  data = data,
  target = "target",
  split = split_vector
)
print(report_with_split)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 1000 4
#>
#> $meta$original_data_shape
#> [1] 1000 4
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:41 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"

Target leakage occurs when features contain information that would not be available at prediction time:
# Create data with obvious target leakage: 100 rows with one legitimate
# numeric feature and a binary factor target
leaky_data <- data.frame(
legitimate_feature = rnorm(100),
target = factor(sample(c("yes", "no"), 100, replace = TRUE)),
stringsAsFactors = FALSE
)
# Add a leaky feature (perfectly correlated with target) -- a direct
# numeric encoding of the target that would not be available at
# prediction time
leaky_data$leaky_feature <- ifelse(leaky_data$target == "yes", 1, 0)
# Audit for target leakage
leakage_report <- leakr_audit(leaky_data, target = "target")
print(leakage_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 100 3
#>
#> $meta$original_data_shape
#> [1] 100 3
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:41 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"

Duplicate records can lead to optimistic performance estimates:
# Create data with duplicates: the first 20 rows of mtcars, with rows
# 1-5 appended a second time as exact duplicates
original_data <- mtcars[1:20, ]
duplicated_data <- rbind(original_data, original_data[1:5, ])
# Add row identifiers. seq_len(nrow(x)) is the safe alternative to
# 1:nrow(x), which misbehaves on zero-row data frames.
duplicated_data$id <- seq_len(nrow(duplicated_data))
# Run duplication audit, telling leakr which column identifies rows
dup_report <- leakr_audit(
  data = duplicated_data,
  target = "mpg",
  id = "id"
)
print(dup_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 25 12
#>
#> $meta$original_data_shape
#> [1] 25 12
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:41 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"

The leakr_audit() function accepts various configuration options to customise the detection process:
# Custom configuration: entries in this list override leakr's defaults;
# unspecified settings keep their default values
custom_config <- list(
sample_size = 10000, # Limit analysis to 10k rows for large datasets
correlation_threshold = 0.9, # Adjust sensitivity for correlation-based detectors
duplicate_threshold = 0.95 # Threshold for near-duplicate detection
)
# Run audit with custom configuration
configured_report <- leakr_audit(
data = iris,
target = "Species",
config = custom_config
)
print(configured_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 150 5
#>
#> $meta$original_data_shape
#> [1] 150 5
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:41 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 10000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.9
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#> $meta$config_used$duplicate_threshold
#> [1] 0.95
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"

Generate diagnostic plots to better understand detected issues:
For large datasets, leakr automatically applies intelligent sampling to maintain performance while preserving detection accuracy:
# Simulate a large dataset of 50,000 rows
set.seed(42)
large_n <- 50000
large_data <- data.frame(
feature1 = rnorm(large_n),
feature2 = rnorm(large_n),
feature3 = sample(letters[1:5], large_n, replace = TRUE),
target = factor(sample(c("positive", "negative"), large_n, replace = TRUE))
)
# leakr will automatically sample this dataset when it exceeds the
# configured sample_size (default 50000 rows, so no sampling occurs here)
large_report <- leakr_audit(large_data, target = "target")
print(large_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 50000 4
#>
#> $meta$original_data_shape
#> [1] 50000 4
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:41 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"

This vignette covered the basics of using leakr for data leakage detection. For more advanced usage, see the other vignettes in this package.
The leakr package provides a systematic approach to detecting data leakage in machine learning workflows. Key takeaways:
- Use leakr_audit() as your primary entry point for leakage detection.

Regular use of leakr can help ensure the integrity and reproducibility of your machine learning models.