## ----setup, include=FALSE----------------------------------------------------- library(dplyr) library(tidyr) library(dtrackr) knitr::opts_chunk$set(echo = TRUE) ## ----------------------------------------------------------------------------- age_cats = factor(sprintf("%02d-%02d",seq(0,80,5),seq(4,84,5))) # A set of synthetic patients: patients = tibble::tibble( patient_id = 1:100, age_category = sample(age_cats,100, replace=TRUE), ethnicity = sample(1:6, 100, replace = TRUE), gender = sample(c("Male","Female"), 100, replace=TRUE), group = sample(c("Cases","Controls"), 100, replace=TRUE) ) # each patient is going to have a random selection of tests tests = tibble::tibble( test_id = 1:1000, patient_id = sample(1:100,1000, replace = TRUE), test_type = sample(c("FBC","LFT","Electrolytes"), 1000, replace=TRUE), test_date = as.Date("2025-01-01")+sample.int(50, 1000, replace=TRUE) ) # and each test a random selection of results consisting of components and # values: tests = tests %>% mutate( result = purrr::map(test_type, ~ case_when( .x == "FBC" ~ list(tibble::tibble( component = c("HB","platelets","WCC"), value = c( runif(1,13.5,15), runif(1,100,1000), runif(1,0,30)) )), .x == "LFT" ~ list(tibble::tibble( component = c("AST","GGT"), value = c( runif(1,0,100), runif(1,0,100)) )), .x == "Electrolytes" ~ list(tibble::tibble( component = c("NA","K","Glucose"), value = c( runif(1,130,150), runif(1,3.3,5.2), runif(1,50,150)) )) )) ) data = patients %>% inner_join( tests %>% unnest(result) %>% unnest(result), by="patient_id" ) data %>% glimpse() ## ----------------------------------------------------------------------------- processed = data %>% # the data is originally long format with one row per test result: track("{.count} test results") %>% mutate(maybe_diabetic = any(component == "Glucose" & value>130), .by = patient_id) %>% nest(test_panel = c(component,value), .messages="") %>% # Now the data is long format with one row per test: comment("{.count} tests") %>% nest(tests = starts_with("test_"), .messages="") %>% # and now long format with one row per patient: comment("{.count} patients") %>% group_by(group) %>% comment("{.count} patients") %>% # these exclusions are at the patient level exclude_all( .headline = "people", maybe_diabetic ~ "{.excluded} diabetics", age_category %in% age_cats[1:4] ~ "{.excluded} under 20" ) %>% # these are now back at the test level unnest(tests) %>% comment("{.count} tests",.headline = "") %>% exclude_all( .headline = "tests", test_date < "2025-01-07" ~ "{.excluded} with invalid dates" ) %>% count_subgroup(test_type, .headline = "") %>% # and finally at the granular test result level unnest(test_panel) %>% exclude_all( .headline = "results", component == "HB" & value < 14 ~ "{.excluded} invalid Hb results", component == "K" & value < 3.5 ~ "{.excluded} haemolysed K+" ) %>% group_by(test_type, .add=TRUE, .messages="By tests") %>% count_subgroup(component, .headline = "{test_type}") %>% ungroup(.messages = "{.count} eligible results") %>% nest(test_panel = c(component,value), .messages="") %>% comment("{.count} eligible tests") %>% nest(tests = starts_with("test_"), .messages="") %>% comment("{.count} eligible patients") processed %>% flowchart() ## ----------------------------------------------------------------------------- data %>% group_by(age_category, gender, group) %>% summarise( n = n_distinct(patient_id) ) %>% pivot_wider(values_from = n, names_from = group) %>% filter(abs(Cases-Controls) <= 1) %>% glimpse() ## ----------------------------------------------------------------------------- old = options(dtrackr.verbose=TRUE) data %>% track() %>% group_by(gender) %>% comment(c("{.count} items","before pause")) %>% # the tracking is paused on this next step as the number of groups becomes >16 group_by(age_category, group, .add=TRUE) %>% comment("This message is not tracked") %>% summarise( n = n_distinct(patient_id) ) %>% pivot_wider(values_from = n, names_from = group) %>% filter(abs(Cases-Controls) <= 1) %>% # the tracking is automatically resumed at this point as the grouping has # returned to manageable levels. group_by(gender) %>% comment(c("{.count} summarised rows","after resume")) %>% flowchart() options(old)