## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(keyed)
library(dplyr)
set.seed(42)

## -----------------------------------------------------------------------------
# January export: clean data
january <- data.frame(
  customer_id = c(101, 102, 103, 104, 105),
  email = c(
    "alice@example.com", "bob@example.com", "carol@example.com",
    "dave@example.com", "eve@example.com"
  ),
  segment = c("premium", "basic", "premium", "basic", "premium")
)

# February export: corrupted upstream (duplicates + missing email)
february <- data.frame(
  customer_id = c(101, 102, 102, 104, 105), # Note: 102 is duplicated
  email = c(
    "alice@example.com", "bob@example.com", NA,
    "dave@example.com", "eve@example.com"
  ),
  segment = c("premium", "basic", "basic", "basic", "premium")
)

## -----------------------------------------------------------------------------
head(february)
nrow(february) # Same row count

## ----error=TRUE---------------------------------------------------------------
try({
  # Define what you expect: customer_id is unique
  january_keyed <- january |>
    key(customer_id) |>
    lock_no_na(email)

  # This works - January data is clean
  january_keyed
})

## ----error=TRUE---------------------------------------------------------------
try({
  # This fails immediately - duplicates detected
  february |>
    key(customer_id)
})

## -----------------------------------------------------------------------------
# Validate a customer export against the expectations used in this script.
#
# Pipes `df` through key(customer_id), lock_no_na(email) and
# lock_nrow(min = 1), and returns the result of that pipeline.
# NOTE(review): presumably these steps error on duplicate customer ids,
# missing emails, or an empty table respectively - confirm against the keyed
# package documentation.
validate_customer_export <- function(df) {
  df |>
    key(customer_id) |>
    lock_no_na(email) |>
    lock_nrow(min = 1)
}

# January: passes
january_clean <- validate_customer_export(january)
summary(january_clean)

## -----------------------------------------------------------------------------
# Filter preserves key
premium_customers <- january_clean |>
  filter(segment == "premium")

has_key(premium_customers)
get_key_cols(premium_customers)

# Mutate preserves key
enriched <- january_clean |>
  mutate(domain = sub(".*@", "", email))

has_key(enriched)

## ----error=TRUE---------------------------------------------------------------
try({
  # This creates duplicates - keyed stops you
  january_clean |>
    mutate(customer_id = 1)
})

## -----------------------------------------------------------------------------
january_clean |>
  unkey() |>
  mutate(customer_id = 1)

## -----------------------------------------------------------------------------
customers <- data.frame(
  customer_id = 1:5,
  name = c("Alice", "Bob", "Carol", "Dave", "Eve"),
  tier = c("gold", "silver", "gold", "bronze", "silver")
) |>
  key(customer_id)

orders <- data.frame(
  order_id = 1:8,
  customer_id = c(1, 1, 2, 3, 3, 3, 4, 5),
  amount = c(100, 150, 200, 50, 75, 125, 300, 80)
) |>
  key(order_id)

## -----------------------------------------------------------------------------
diagnose_join(customers, orders, by = "customer_id", use_joinspy = FALSE)

## -----------------------------------------------------------------------------
compare_keys(customers, orders)

## -----------------------------------------------------------------------------
# Add UUIDs to rows
customers_tracked <- customers |>
  add_id()

customers_tracked

## -----------------------------------------------------------------------------
# Filter: IDs persist
gold_customers <- customers_tracked |>
  filter(tier == "gold")

get_id(gold_customers)

# Compare with original
compare_ids(customers_tracked, gold_customers)

## -----------------------------------------------------------------------------
batch1 <- data.frame(x = 1:3) |>
  add_id()
batch2 <- data.frame(x = 4:6) # No IDs yet

# bind_id assigns new IDs to batch2 and checks for conflicts
combined <- bind_id(batch1, batch2)
combined

## -----------------------------------------------------------------------------
# Commit current state as reference
reference_data <- data.frame(
  region_id = c("US", "EU", "APAC"),
  tax_rate = c(0.08, 0.20, 0.10)
) |>
  key(region_id) |>
  commit_keyed()

## -----------------------------------------------------------------------------
# No changes yet
check_drift(reference_data)

## -----------------------------------------------------------------------------
# Simulate upstream change: EU tax rate changed
modified_data <- reference_data
modified_data$tax_rate[2] <- 0.21

# Drift detected!
check_drift(modified_data)

## -----------------------------------------------------------------------------
# Remove snapshots when done
clear_all_snapshots()