---
title: "FakeDataR: Getting started"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{FakeDataR: Getting started}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE
)
set.seed(1) # ensure RNG is initialized for clean vignette sessions
library(FakeDataR)
```

This vignette shows how to mirror the structure of real data with fully synthetic values, verify the structure, and produce an LLM-ready bundle.

## Quick start

```{r tiny-end-to-end}
# tiny input with a few likely sensitive fields
df <- data.frame(
  id = sprintf("id%03d", 1:10),
  email = paste0("a", 1:10, "@x.com"),
  Progress = paste0(sample(80:100, 10, replace = TRUE), "%"),
  check.names = FALSE # keep column names exactly as written
)

orig <- prepare_input_data(df)

fake_priv <- generate_fake_with_privacy(
  data = orig,
  n = 10,
  level = "low",
  seed = 1,
  sensitive = c("id", "email"),
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# quick validation sample
head(validate_fake(orig, fake_priv), 5)
```

```{r mtcars-basic}
library(FakeDataR)

# Basic fake from a data.frame
fake_mtc <- generate_fake_data(mtcars, n = 200, seed = 1)
validate_fake(mtcars, fake_mtc)
```

## Factors, characters, and numerics

```{r co2-factors}
fake_co2 <- generate_fake_data(as.data.frame(CO2), n = 200, seed = 2)
validate_fake(as.data.frame(CO2), fake_co2)
```

```{r toothgrowth-mixed}
fake_tg <- generate_fake_data(ToothGrowth, n = 120, seed = 3)
validate_fake(ToothGrowth, fake_tg)
```

## Dates and POSIXct (time zones preserved)

```{r date-demo}
df_date <- data.frame(
  d = seq(as.Date("2020-01-01"), by = "day", length.out = 50)
)
fake_date <- generate_fake_data(df_date, n = 80, seed = 4)
str(fake_date$d)
```

```{r posixct-demo}
dt <- data.frame(
  when = seq.POSIXt(
    as.POSIXct("2023-05-01 00:00:00", tz = "America/New_York"),
    by = "hour",
    length.out = 200
  )
)
fake_dt <- generate_fake_data(dt, n = 50, seed = 5)
str(fake_dt$when)
range(fake_dt$when)
```

## Public datasets - wrap in guards, trim sizes

These chunks run only if the packages are installed.

```{r flights-demo, message=FALSE}
if (requireNamespace("nycflights13", quietly = TRUE)) {
  fl <- nycflights13::flights
  set.seed(10)
  fl_small <- fl[sample.int(nrow(fl), 2000), ] # smaller
  fake_fl <- generate_fake_data(
    fl_small,
    n = 500,
    seed = 10,
    numeric_mode = "distribution"
  )
  head(validate_fake(fl_small, fake_fl), 5)
} else {
  message("nycflights13 not installed - skipping.")
}
```

```{r penguins-demo, message=FALSE}
if (requireNamespace("palmerpenguins", quietly = TRUE)) {
  peng <- na.omit(
    palmerpenguins::penguins[, c("species", "island", "bill_length_mm", "sex")]
  )
  fake_peng <- generate_fake_data(
    peng,
    n = 400,
    seed = 11,
    category_mode = "preserve"
  )
  head(validate_fake(peng, fake_peng), 5)
} else {
  message("palmerpenguins not installed - skipping.")
}
```

## Gapminder demo

```{r gapminder-demo, message=FALSE}
# Optional package; make the chunk robust
if (requireNamespace("gapminder", quietly = TRUE)) {
  set.seed(21)
  gm <- gapminder::gapminder
  # Keep it light if you want. The sample size is capped at nrow(gm):
  # sampling without replacement fails when more rows are requested
  # than the data set contains.
  gm <- gm[sample.int(nrow(gm), min(nrow(gm), 2000L)), ]
  fake_gm <- generate_fake_data(
    gm,
    n = 800,
    seed = 21,
    numeric_mode = "distribution", # nicer numeric spread
    category_mode = "preserve" # keep factor levels
  )
  validate_fake(gm, fake_gm)
} else {
  message("gapminder not installed; skipping demo.")
}
```

## Sensitive columns: fake vs drop

```{r pii-demo, message=FALSE}
set.seed(12)
df_pii <- data.frame(
  id = 1:100,
  email = sprintf("user%03d@corp.com", 1:100),
  phone = sprintf("(415) 555-%04d", 1:100),
  spend = runif(100, 10, 500)
)

fake_keep <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake"
)
fake_drop <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop"
)

names(fake_keep) # expect id/email/phone present but synthetic
names(fake_drop) # expect only "spend"
```

## LLM bundle: data + schema + README (+ optional ZIP)

```{r llm-bundle}
# Everything is written under tempdir() so the vignette leaves no files behind.
b1 <- llm_bundle(
  data = ToothGrowth,
  n = 150,
  level = "high",
  seed = 10,
  formats = c("csv", "rds"),
  path = tempdir(),
  filename = "toothgrowth_fake",
  write_prompt = TRUE,
  zip = TRUE
)
b1$schema_path
b1$readme_path
b1$zip_path
```

## Parquet export (optional)

```{r parquet-export, message=FALSE}
if (requireNamespace("arrow", quietly = TRUE)) {
  fake_air <- generate_fake_data(airquality, n = 400, seed = 20)
  export_fake(fake_air, file.path(tempdir(), "air.parquet"))
} else {
  message("arrow not installed - skipping Parquet export.")
}
```

## Reproducibility

```{r reproducibility}
# CO2 carries extra groupedData classes; work on a plain data.frame.
co2_df <- as.data.frame(CO2)

# Same input, same n, same seed: the two results should be identical.
a1 <- generate_fake_data(co2_df, n = 123, seed = 42)
a2 <- generate_fake_data(co2_df, n = 123, seed = 42)
identical(a1, a2)
```

```{r big-benchmark, eval=FALSE}
# Not evaluated when the vignette is built (eval=FALSE); run it by hand.
big <- data.frame(
  a = runif(2e5),
  b = sample(letters, 2e5, replace = TRUE),
  c = as.Date("2020-01-01") + sample.int(3000, 2e5, replace = TRUE)
)
system.time({
  fake_big <- generate_fake_data(big, n = 2e5, seed = 99)
})
```