ympes provides a collection of functions for working with age intervals with underlying implementations that have been optimised for performance.
ages_to_interval()
`ages_to_interval()` provides categorisation of ages based on specified right-hand interval limits. The resultant groupings will span the natural numbers (from 0) and will always be closed on the left and open on the right. For example, if `limits = c(1, 10, 30)` the possible groupings will be “[0, 1)”, “[1, 10)”, “[10, 30)” and “[30, Inf)”. This is roughly comparable to a call of `cut(ages, right = FALSE, breaks = c(0, limits, Inf))` but with the start and end points of each interval also returned, as separate columns of the resulting data frame.
Note that the `limits` argument in all functions defaults to `c(1L, 5L, 15L, 25L, 45L, 65L)`, but for clarity in examples we will explicitly set this value where used.
library(ympes)
ages_to_interval(ages = 0:9, limits = c(1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound
#> 1 [0, 1) 0 1
#> 2 [1, 5) 1 5
#> 3 [1, 5) 1 5
#> 4 [1, 5) 1 5
#> 5 [1, 5) 1 5
#> 6 [5, 15) 5 15
#> 7 [5, 15) 5 15
#> 8 [5, 15) 5 15
#> 9 [5, 15) 5 15
#> 10 [5, 15) 5 15
ages_to_interval(1:10, limits = 6L)
#> interval lower_bound upper_bound
#> 1 [0, 6) 0 6
#> 2 [0, 6) 0 6
#> 3 [0, 6) 0 6
#> 4 [0, 6) 0 6
#> 5 [0, 6) 0 6
#> 6 [6, Inf) 6 Inf
#> 7 [6, Inf) 6 Inf
#> 8 [6, Inf) 6 Inf
#> 9 [6, Inf) 6 Inf
#> 10 [6, Inf) 6 Inf
# limits are given explicitly (identical to the documented default) so this
# example follows the same convention as the other calls
x <- ages_to_interval(1:100, limits = c(1L, 5L, 15L, 25L, 45L, 65L))
str(x)
#> 'data.frame': 100 obs. of 3 variables:
#> $ interval : Ord.factor w/ 7 levels "[0, 1)"<"[1, 5)"<..: 2 2 2 2 3 3 3 3 3 3 ...
#> $ lower_bound: num 1 1 1 1 5 5 5 5 5 5 ...
#> $ upper_bound: num 5 5 5 5 15 15 15 15 15 15 ...
head(x$interval)
#> [1] [1, 5) [1, 5) [1, 5) [1, 5) [5, 15) [5, 15)
#> 7 Levels: [0, 1) < [1, 5) < [5, 15) < [15, 25) < [25, 45) < ... < [65, Inf)
split_interval_counts()
`split_interval_counts()` splits counts within an age interval into counts for individual years based on a given weighting. Age intervals are specified by their lower (closed) and upper (open) bounds, i.e. intervals of the form [lower, upper).
# by default counts are split equally across ages within intervals
split_interval_counts(
  lower_bounds = c(0L, 5L, 10L),
  upper_bounds = c(5L, 10L, 20L),
  counts = c(5L, 10L, 30L)
)
#> age count
#> 1 0 1
#> 2 1 1
#> 3 2 1
#> 4 3 1
#> 5 4 1
#> 6 5 2
#> 7 6 2
#> 8 7 2
#> 9 8 2
#> 10 9 2
#> 11 10 3
#> 12 11 3
#> 13 12 3
#> 14 13 3
#> 15 14 3
#> 16 15 3
#> 17 16 3
#> 18 17 3
#> 19 18 3
#> 20 19 3
# Population weightings to apply for individual years can be specified by
# the weights argument. If these are specified, they must be of length
# `max_upper` and represent weights in the range 0:(max_upper - 1).
max_upper <- 20L

# start from all-zero weights, then give every other age (0, 2, 4, ...) a
# weight of 1; the logical index c(TRUE, FALSE) is recycled along the vector
weights <- integer(max_upper)
weights[c(TRUE, FALSE)] <- 1L

split_interval_counts(
  lower_bounds = c(0L, 5L, 10L),
  upper_bounds = c(5L, 10L, 20L),
  counts = c(5L, 10L, 30L),
  max_upper = max_upper,
  weights = weights
)
#> age count
#> 1 0 1.666667
#> 2 1 0.000000
#> 3 2 1.666667
#> 4 3 0.000000
#> 5 4 1.666667
#> 6 5 0.000000
#> 7 6 5.000000
#> 8 7 0.000000
#> 9 8 5.000000
#> 10 9 0.000000
#> 11 10 6.000000
#> 12 11 0.000000
#> 13 12 6.000000
#> 14 13 0.000000
#> 15 14 6.000000
#> 16 15 0.000000
#> 17 16 6.000000
#> 18 17 0.000000
#> 19 18 6.000000
#> 20 19 0.000000
aggregate_age_counts()
`aggregate_age_counts()` provides aggregation of counts across ages (in years). It is similar to a `cut()` and `tapply()` pattern but optimised for speed over flexibility. Groupings are the same as in `ages_to_interval()` and counts will be provided across all natural numbers as well as for missing values.
# default ages generated as 0:(length(counts) - 1L) if only counts provided.
aggregate_age_counts(counts = 1:65, limits = c(1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 1
#> 2 [1, 5) 1 5 14
#> 3 [5, 15) 5 15 105
#> 4 [15, 25) 15 25 205
#> 5 [25, 45) 25 45 710
#> 6 [45, 65) 45 65 1110
#> 7 [65, Inf) 65 Inf 0
#> 8 <NA> NA NA 0
aggregate_age_counts(counts = 1:65, limits = 50)
#> interval lower_bound upper_bound count
#> 1 [0, 50) 0 50 1275
#> 2 [50, Inf) 50 Inf 870
#> 3 <NA> NA NA 0
# NA ages are handled with their own grouping
ages <- 1:65
# blank out the first 44 ages so their counts fall into the <NA> group
ages[1:44] <- NA

aggregate_age_counts(
  counts = 1:65,
  ages = ages,
  limits = c(1L, 5L, 15L, 25L, 45L, 65L)
)
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 0
#> 2 [1, 5) 1 5 0
#> 3 [5, 15) 5 15 0
#> 4 [15, 25) 15 25 0
#> 5 [25, 45) 25 45 0
#> 6 [45, 65) 45 65 1090
#> 7 [65, Inf) 65 Inf 65
#> 8 <NA> NA NA 990
reaggregate_interval_counts()
`reaggregate_interval_counts()` is equivalent to, but more efficient than, calling `split_interval_counts()` and then `aggregate_age_counts()`.
The example below shows how it can be used to redistribute counts across a desired set of age intervals. We use data included in the package that has been obtained from the 2021 census and modify this based on our desired interval limits.
# census data
data(pop_dat)
pop_dat
#> area_code area_name age_category value
#> 1 K04000001 England and Wales [0, 5) 3232100
#> 2 K04000001 England and Wales [5, 10) 3524600
#> 3 K04000001 England and Wales [10, 15) 3595900
#> 4 K04000001 England and Wales [15, 20) 3394700
#> 5 K04000001 England and Wales [20, 25) 3602100
#> 6 K04000001 England and Wales [25, 30) 3901800
#> 7 K04000001 England and Wales [30, 35) 4148800
#> 8 K04000001 England and Wales [35, 40) 3981600
#> 9 K04000001 England and Wales [40, 45) 3755700
#> 10 K04000001 England and Wales [45, 50) 3788700
#> 11 K04000001 England and Wales [50, 55) 4123400
#> 12 K04000001 England and Wales [55, 60) 4029000
#> 13 K04000001 England and Wales [60, 65) 3455700
#> 14 K04000001 England and Wales [65, 70) 2945100
#> 15 K04000001 England and Wales [70, 75) 2978000
#> 16 K04000001 England and Wales [75, 80) 2170300
#> 17 K04000001 England and Wales [80, 85) 1517000
#> 18 K04000001 England and Wales [85, 90) 925100
#> 19 K04000001 England and Wales [90, Inf) 527900
# each row is for the same region so discard for the moment
dat <- subset(pop_dat, select = c(age_category, value))

# extract upper and lower bounds
# lower bound: the digits following the opening "[";
# upper bound: the text between ", " and the closing ")" (may be "Inf")
dat <- transform(
  dat,
  lower_bound = as.numeric(sub("\\[([0-9]+), .+)", "\\1", age_category)),
  upper_bound = as.numeric(sub(".+, (.+))", "\\1", age_category))
)

head(dat, n = 10)
#> age_category value lower_bound upper_bound
#> 1 [0, 5) 3232100 0 5
#> 2 [5, 10) 3524600 5 10
#> 3 [10, 15) 3595900 10 15
#> 4 [15, 20) 3394700 15 20
#> 5 [20, 25) 3602100 20 25
#> 6 [25, 30) 3901800 25 30
#> 7 [30, 35) 4148800 30 35
#> 8 [35, 40) 3981600 35 40
#> 9 [40, 45) 3755700 40 45
#> 10 [45, 50) 3788700 45 50
# recategorise based on ages
with(
  dat,
  reaggregate_interval_counts(
    lower_bounds = lower_bound,
    upper_bounds = upper_bound,
    counts = value,
    limits = c(1L, 5L, 15L, 25L, 45L, 65L),
    max_upper = 100L,
    weights = NULL
  )
)
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 646420
#> 2 [1, 5) 1 5 2585680
#> 3 [5, 15) 5 15 7120500
#> 4 [15, 25) 15 25 6996800
#> 5 [25, 45) 25 45 15787900
#> 6 [45, 65) 45 65 15396800
#> 7 [65, Inf) 65 Inf 11063400
#> 8 <NA> NA NA 0