Skip to contents

ageutils provides a collection of functions for working with age intervals whose underlying implementations have been optimised for performance.

breaks_to_interval()

breaks_to_interval() provides a categorisation based on specified breaks which represent left-hand interval limits. The resultant groupings span from the minimum break through to a specified max_upper (Inf by default) and will always be closed on the left and open on the right. As an example, if breaks = c(0, 1, 10, 30) and max_upper is left at its default, the interval categories would be [0, 1), [1, 10), [10, 30) and [30, Inf). Ages greater than or equal to max_upper will be returned as NA.

The returned value is a data frame with 3 columns: an ordered factor with a character representation of the interval, and two columns representing the numeric values of the corresponding lower (closed) and upper (open) bounds.

library(ageutils)

breaks_to_interval(breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound
#> 1    [0, 1)           0           1
#> 2    [1, 5)           1           5
#> 3   [5, 15)           5          15
#> 4  [15, 25)          15          25
#> 5  [25, 45)          25          45
#> 6  [45, 65)          45          65
#> 7 [65, Inf)          65         Inf

breaks_to_interval(breaks = c(1L, 5L, 15L), max_upper = 25L)
#>   interval lower_bound upper_bound
#> 1   [1, 5)           1           5
#> 2  [5, 15)           5          15
#> 3 [15, 25)          15          25

cut_ages()

cut_ages() provides categorisation of ages based on specified breaks which represent the left-hand interval limits. Categorisation is based on the breaks and follows the approach of breaks_to_interval.

cut_ages(ages = 0:9, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound
#> 1    [0, 1)           0           1
#> 2    [1, 5)           1           5
#> 3    [1, 5)           1           5
#> 4    [1, 5)           1           5
#> 5    [1, 5)           1           5
#> 6   [5, 15)           5          15
#> 7   [5, 15)           5          15
#> 8   [5, 15)           5          15
#> 9   [5, 15)           5          15
#> 10  [5, 15)           5          15

cut_ages(1:10, breaks = c(0L, 4L), max_upper = 9L)
#>    interval lower_bound upper_bound
#> 1    [0, 4)           0           4
#> 2    [0, 4)           0           4
#> 3    [0, 4)           0           4
#> 4    [4, 9)           4           9
#> 5    [4, 9)           4           9
#> 6    [4, 9)           4           9
#> 7    [4, 9)           4           9
#> 8    [4, 9)           4           9
#> 9      <NA>          NA          NA
#> 10     <NA>          NA          NA

x <- cut_ages(1:100, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))

str(x)
#> 'data.frame':    100 obs. of  3 variables:
#>  $ interval   : Ord.factor w/ 7 levels "[0, 1)"<"[1, 5)"<..: 2 2 2 2 3 3 3 3 3 3 ...
#>  $ lower_bound: num  1 1 1 1 5 5 5 5 5 5 ...
#>  $ upper_bound: num  5 5 5 5 15 15 15 15 15 15 ...

head(x$interval)
#> [1] [1, 5)  [1, 5)  [1, 5)  [1, 5)  [5, 15) [5, 15)
#> 7 Levels: [0, 1) < [1, 5) < [5, 15) < [15, 25) < [25, 45) < ... < [65, Inf)

split_interval_counts()

split_interval_counts() splits counts within an age interval into counts for individual years based on a given weighting. Age intervals are specified by their lower (closed) and upper (open) bounds, i.e. intervals of the form [lower, upper).

# by default counts are split equally across ages within intervals
split_interval_counts(
    lower_bounds = c(0L, 5L, 10L),
    upper_bounds = c(5L, 10L, 20L),
    counts = c(5L, 10L, 30L)
)
#>    age count
#> 1    0     1
#> 2    1     1
#> 3    2     1
#> 4    3     1
#> 5    4     1
#> 6    5     2
#> 7    6     2
#> 8    7     2
#> 9    8     2
#> 10   9     2
#> 11  10     3
#> 12  11     3
#> 13  12     3
#> 14  13     3
#> 15  14     3
#> 16  15     3
#> 17  16     3
#> 18  17     3
#> 19  18     3
#> 20  19     3

# Population weightings to apply for individual years can be specified by
# the weights argument. If these are specified, they must be of length
# `max_upper` and represent weights in the range 0:(max_upper - 1).
max_upper <- 20L
weights <- integer(max_upper)
weights[c(TRUE, FALSE)] <- 1L
split_interval_counts(
    lower_bounds = c(0L, 5L, 10L),
    upper_bounds = c(5L, 10L, 20L),
    counts = c(5L, 10L, 30L),
    max_upper = max_upper,
    weights = weights
)
#>    age    count
#> 1    0 1.666667
#> 2    1 0.000000
#> 3    2 1.666667
#> 4    3 0.000000
#> 5    4 1.666667
#> 6    5 0.000000
#> 7    6 5.000000
#> 8    7 0.000000
#> 9    8 5.000000
#> 10   9 0.000000
#> 11  10 6.000000
#> 12  11 0.000000
#> 13  12 6.000000
#> 14  13 0.000000
#> 15  14 6.000000
#> 16  15 0.000000
#> 17  16 6.000000
#> 18  17 0.000000
#> 19  18 6.000000
#> 20  19 0.000000

aggregate_age_counts()

aggregate_age_counts() provides aggregation of counts across ages (in years). It is similar to a cut() and tapply() pattern but optimised for speed over flexibility. Groupings are the same as in cut_ages() and counts will be provided across all natural numbers as well as for missing values.

# default ages generated as 0:(length(counts) - 1L) if only counts provided.
aggregate_age_counts(counts = 1:65, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound count
#> 1    [0, 1)           0           1     1
#> 2    [1, 5)           1           5    14
#> 3   [5, 15)           5          15   105
#> 4  [15, 25)          15          25   205
#> 5  [25, 45)          25          45   710
#> 6  [45, 65)          45          65  1110
#> 7 [65, Inf)          65         Inf     0

# NA ages are also handled with their own grouping
ages <- 1:65
ages[1:44] <- NA
aggregate_age_counts(
    counts = 1:65,
    ages = ages,
    breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L)
)
#>    interval lower_bound upper_bound count
#> 1    [0, 1)           0           1     0
#> 2    [1, 5)           1           5     0
#> 3   [5, 15)           5          15     0
#> 4  [15, 25)          15          25     0
#> 5  [25, 45)          25          45     0
#> 6  [45, 65)          45          65  1090
#> 7 [65, Inf)          65         Inf    65
#> 8      <NA>          NA          NA   990

reaggregate_interval_counts()

reaggregate_interval_counts() is equivalent to, but more efficient than, a call to split_interval_counts() followed by aggregate_age_counts().

The example below shows how it can be used to redistribute counts across a desired set of age intervals. We use data included in the package that has been obtained from the 2021 census and modify this based on our desired interval limits.

# census data
data(pop_dat)
pop_dat
#>    area_code         area_name age_category   value
#> 1  K04000001 England and Wales       [0, 5) 3232100
#> 2  K04000001 England and Wales      [5, 10) 3524600
#> 3  K04000001 England and Wales     [10, 15) 3595900
#> 4  K04000001 England and Wales     [15, 20) 3394700
#> 5  K04000001 England and Wales     [20, 25) 3602100
#> 6  K04000001 England and Wales     [25, 30) 3901800
#> 7  K04000001 England and Wales     [30, 35) 4148800
#> 8  K04000001 England and Wales     [35, 40) 3981600
#> 9  K04000001 England and Wales     [40, 45) 3755700
#> 10 K04000001 England and Wales     [45, 50) 3788700
#> 11 K04000001 England and Wales     [50, 55) 4123400
#> 12 K04000001 England and Wales     [55, 60) 4029000
#> 13 K04000001 England and Wales     [60, 65) 3455700
#> 14 K04000001 England and Wales     [65, 70) 2945100
#> 15 K04000001 England and Wales     [70, 75) 2978000
#> 16 K04000001 England and Wales     [75, 80) 2170300
#> 17 K04000001 England and Wales     [80, 85) 1517000
#> 18 K04000001 England and Wales     [85, 90)  925100
#> 19 K04000001 England and Wales    [90, Inf)  527900

# each row is for the same region so discard the area columns for the moment
dat <- subset(pop_dat, select = c(age_category, value))

# extract upper and lower bounds
dat <- transform(
    dat,
    lower_bound = as.numeric(sub("\\[([0-9]+), .+)", "\\1", age_category)),
    upper_bound = as.numeric(sub(".+, (.+))", "\\1", age_category))
)

head(dat, n=10)
#>    age_category   value lower_bound upper_bound
#> 1        [0, 5) 3232100           0           5
#> 2       [5, 10) 3524600           5          10
#> 3      [10, 15) 3595900          10          15
#> 4      [15, 20) 3394700          15          20
#> 5      [20, 25) 3602100          20          25
#> 6      [25, 30) 3901800          25          30
#> 7      [30, 35) 4148800          30          35
#> 8      [35, 40) 3981600          35          40
#> 9      [40, 45) 3755700          40          45
#> 10     [45, 50) 3788700          45          50

# recategorise based on ages
with(
    dat,
    reaggregate_interval_counts(
        lower_bounds = lower_bound,
        upper_bounds = upper_bound,
        counts = value,
        breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L),
        max_upper = 100L,
        weights = NULL
    )
)
#> Warning in reaggregate_interval_counts(lower_bounds = lower_bound, upper_bounds
#> = upper_bound, : `upper_bounds` greater than `max_upper` (100) have been
#> replaced prior to splitting.
#>    interval lower_bound upper_bound    count
#> 1    [0, 1)           0           1   646420
#> 2    [1, 5)           1           5  2585680
#> 3   [5, 15)           5          15  7120500
#> 4  [15, 25)          15          25  6996800
#> 5  [25, 45)          25          45 15787900
#> 6  [45, 65)          45          65 15396800
#> 7 [65, Inf)          65         Inf 11063400