Bootstrapping standardized rates and decomposition effects

The basic premise of bootstrapping Das Gupta's rate decomposition is to expand the data out to unit level, re-sample with replacement, and then aggregate back up.

Data from Wang (2000):

library(tidyverse)
library(DasGuptR)

eg.wang2000 <- data.frame(
  pop = rep(c("male", "female"), e = 12),
  ethnicity = rep(1:3, e = 4),
  age_group = rep(1:4, 3),
  size = c(
    130, 1305, 1539, 316, 211, 697, 334, 48, 105, 475, 424, 49,
    70, 604, 428, 43, 55, 127, 44, 9, 72, 178, 103, 12
  ),
  rate = c(
    12.31, 34.90, 52.91, 44.44, 16.67, 36.40, 51.20, 41.67, 12.38, 19.20, 21.23, 12.50,
    17.14, 35.55, 48.71, 55.81, 14.55, 39.37, 32.56, 55.56, 22.22, 20.34, 13.59, 8.33
  )
)

To do this with the DasGuptR package, we first create a function that "uncounts" the data to the individual level (one row per person, so the number of rows equals the total population size), re-samples it with replacement, and then aggregates back up to the level of the original data.

getBS <- function() {
  # expand out to one row per individual, flagged as event (r1) or non-event (r0)
  exp <- eg.wang2000 |>
    mutate(
      r1 = (rate / 100) * size, # n who experienced the event
      r0 = (1 - (rate / 100)) * size # n who did not
    ) |>
    select(pop, ethnicity, age_group, r0:r1) |>
    pivot_longer(r0:r1, names_to = "r", values_to = "n") |>
    mutate(n = as.integer(n)) |>
    uncount(n)

  # re-sample each population with replacement, then aggregate back up
  bs <- lapply(unique(eg.wang2000$pop), \(p) {
    slice_sample(exp[exp$pop == p, ], prop = 1, replace = TRUE) |>
      group_by(ethnicity, age_group) |>
      reframe(
        pop = p,
        size = n(),
        rate = mean(r == "r1") * 100
      ) |>
      ungroup()
  })

  do.call(rbind, bs)
}
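
Because the resampling is random, results will vary slightly from run to run; fixing the RNG seed beforehand makes them reproducible (the seed value here is arbitrary):

set.seed(2024)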

Take 500 resamples:

bsamps <-
  tibble(
    k = 1:500,
    i = map(1:500, ~ getBS())
  )

and apply the standardisation to each resample:

bsamps <- bsamps |>
  mutate(
    dgo = map(i, ~ dgnpop(.,
      pop = "pop", factors = "rate",
      id_vars = c("ethnicity", "age_group"),
      crossclassified = "size"
    ))
  )

Here are the original difference effects:

dgnpop(eg.wang2000,
  pop = "pop", factors = "rate",
  id_vars = c("ethnicity", "age_group"),
  crossclassified = "size"
) |>
  dg_table()
                   female     male        diff decomp
age_group_struct 34.80709 36.73462  1.92753072  68.93
ethnicity_struct 35.79043 35.75128 -0.03914904  -1.40
rate             35.35659 36.26448  0.90788961  32.47
crude            34.59755 37.39382  2.79627129 100.00

And here are the standard errors:

bsamps |>
  select(k, dgo) |>
  unnest(dgo) |>
  group_by(k, factor) |>
  reframe(diff = rate[pop == "female"] - rate[pop == "male"]) |>
  group_by(factor) |>
  reframe(se = sd(diff))
# A tibble: 4 × 2
  factor              se
  <chr>            <dbl>
1 age_group_struct 0.313
2 crude            1.36 
3 ethnicity_struct 0.338
4 rate             1.34 

We can also take these as a proportion of the crude rate difference:
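
One way to do this (a sketch, not part of the package itself) is to express, within each resample, each difference effect as a percentage of that resample's crude difference, and then take the standard deviation across resamples:

bsamps |>
  select(k, dgo) |>
  unnest(dgo) |>
  # difference effect for each factor, within each resample
  group_by(k, factor) |>
  reframe(diff = rate[pop == "female"] - rate[pop == "male"]) |>
  # express as a percentage of that resample's crude difference
  group_by(k) |>
  mutate(decomp = 100 * diff / diff[factor == "crude"]) |>
  # standard error of each decomposition percentage
  group_by(factor) |>
  reframe(se = sd(decomp))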

Uncertainty in rates

We can use this same approach to estimate uncertainty in adjusted rates. For instance, using the Scottish reconvictions data included in the package:

data(reconv)

src <- reconv |>
  transmute(
    pop = year, sex = Sex, age = Age,
    rate = prev_rate, # prevalence rates here are proportions (0-1), not percentages
    size = offenders,
    r1 = rate * size, # n reconvicted
    r0 = (1 - rate) * size # n not reconvicted
  )

getBS <- function() {
  # expand out to one row per individual
  exp <- src |>
    select(pop, sex, age, r0, r1) |>
    pivot_longer(r0:r1, names_to = "r", values_to = "n") |>
    mutate(n = as.integer(n)) |>
    uncount(n)

  # re-sample each population (year) with replacement, then aggregate back up
  bs <- lapply(unique(src$pop), \(p) {
    slice_sample(exp[exp$pop == p, ], prop = 1, replace = TRUE) |>
      group_by(sex, age) |>
      reframe(
        pop = p,
        size = n(),
        rate = mean(r == "r1") # rates are proportions here, so no scaling by 100
      ) |>
      ungroup()
  })

  do.call(rbind, bs)
}


# 500 resamples
bsamps <-
  tibble(
    k = 1:500,
    i = map(1:500, ~ getBS())
  )

# apply DG on each resample
bsamps <- bsamps |>
  mutate(
    dgo = map(i, ~ dgnpop(.,
      pop = "pop", factors = "rate",
      id_vars = c("age", "sex"),
      crossclassified = "size"
    ))
  )
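
From here, the bootstrap distribution gives us uncertainty in the adjusted rates themselves. As a sketch (assuming, as in the output above, that the adjusted rates are the rows of the dgnpop output where factor == "rate"), we might summarise each year's age-sex adjusted rate with a percentile interval:

bsamps |>
  select(k, dgo) |>
  unnest(dgo) |>
  filter(factor == "rate") |> # the age-sex adjusted rates
  group_by(pop) |>
  reframe(
    est = mean(rate),
    lwr = quantile(rate, .025),
    upr = quantile(rate, .975)
  )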