## ----include = FALSE----------------------------------------------------------
# Chunk defaults applied to every code chunk that follows
knitr::opts_chunk$set(
  fig.width = 7,
  fig.height = 5,
  comment = "#>",   # prefix for printed output lines
  collapse = TRUE   # show source and output in one block
)

## ----setup--------------------------------------------------------------------
library(leakr)

## ----basic_example------------------------------------------------------------
# Make the built-in iris data frame available in the workspace
data("iris")

# Audit iris, treating the Species column as the prediction target
report <- leakr_audit(data = iris, target = "Species")

# Print the resulting report object
print(report)

## ----examine_report-----------------------------------------------------------
# Summarise the audit report from the basic example: request the top 5
# entries and include the configuration in the summary
summary_report <- leakr_summarise(
  report,
  show_config = TRUE,
  top_n = 5
)
print(summary_report)

## ----train_test_example-------------------------------------------------------
# Simulate a dataset together with an explicit train/test split.
# Features, target and split are all drawn at random, so this example shows
# how to pass split labels to the audit rather than constructing a real leak.
set.seed(123)
n <- 1000

# Three independent numeric features and a two-level factor target
data <- data.frame(
  feature1 = rnorm(n),
  feature2 = rnorm(n),
  feature3 = rnorm(n),
  target = factor(sample(c("A", "B"), n, replace = TRUE))
)

# Assign 70% of the rows to "train" and the remainder to "test".
# sample.int() avoids the 1:n pitfall for degenerate n, and round() makes the
# requested size a whole number instead of relying on silent truncation of a
# floating-point product.
train_indices <- sample.int(n, size = round(0.7 * n))
split_vector <- rep("test", n)
split_vector[train_indices] <- "train"

# Run audit with split information (one "train"/"test" label per row)
report_with_split <- leakr_audit(
  data = data,
  target = "target",
  split = split_vector
)

print(report_with_split)

## ----target_leakage-----------------------------------------------------------
# Build a small data frame: one random numeric feature plus a random
# yes/no factor target
leaky_data <- data.frame(
  legitimate_feature = rnorm(n = 100),
  target = factor(sample(c("yes", "no"), size = 100, replace = TRUE)),
  stringsAsFactors = FALSE
)

# Derive a feature directly from the target: 1 where target is "yes",
# 0 otherwise, so it encodes the target exactly
leaky_data[["leaky_feature"]] <- as.numeric(leaky_data[["target"]] == "yes")

# Audit the data with "target" as the outcome column and print the report
leakage_report <- leakr_audit(data = leaky_data, target = "target")
print(leakage_report)

## ----duplication--------------------------------------------------------------
# Create data with duplicates: take the first 20 rows of mtcars and append
# a second copy of its first 5 rows
original_data <- mtcars[seq_len(20), ]
duplicated_data <- rbind(original_data, original_data[seq_len(5), ])

# Add row identifiers (seq_len() stays correct even for zero-row input,
# where 1:nrow() would yield c(1, 0))
duplicated_data$id <- seq_len(nrow(duplicated_data))

# Run duplication audit, using "mpg" as the target and "id" as the
# identifier column
dup_report <- leakr_audit(
  data = duplicated_data,
  target = "mpg",
  id = "id"
)

print(dup_report)

## ----configuration------------------------------------------------------------
# Settings that override the audit defaults
custom_config <- list(
  # Limit analysis to 10k rows for large datasets
  sample_size = 1e4,
  # Adjust sensitivity for correlation-based detectors
  correlation_threshold = 0.9,
  # Threshold for near-duplicate detection
  duplicate_threshold = 0.95
)

# Audit iris again, this time passing the custom settings via `config`
configured_report <- leakr_audit(
  config = custom_config,
  data = iris,
  target = "Species"
)

print(configured_report)

## ----visualisation, eval=FALSE------------------------------------------------
# # Generate diagnostic plots
# plots <- generate_diagnostic_plots(report)
# 
# # Display plots (if available)
# if (!is.null(plots)) {
#   plot(plots)
# }

## ----large_dataset_simulation-------------------------------------------------
# Build a 50,000-row synthetic dataset with a fixed seed
set.seed(42)
large_n <- 5e4

# Two numeric features, one five-level character feature, and a binary
# factor target; columns are drawn in this order to keep the random stream
# unchanged
large_data <- data.frame(
  feature1 = rnorm(n = large_n),
  feature2 = rnorm(n = large_n),
  feature3 = sample(c("a", "b", "c", "d", "e"), size = large_n, replace = TRUE),
  target = factor(
    sample(c("positive", "negative"), size = large_n, replace = TRUE)
  )
)

# leakr will automatically sample this dataset
large_report <- leakr_audit(data = large_data, target = "target")
print(large_report)