8.1 Workforce — Data Preparation

SIRS × DREAMS, HC/SIP parallel, canonical pipeline

Author

ARC Institute

Published

April 22, 2026

Code
suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
  library(stringr)
  library(readr)
  library(readxl)
  library(purrr)
  library(stringdist)
  library(tibble)
})

# Absolute paths — qmd-location-independent (Quarto sets cwd to the qmd folder by default)
PROJECT_ROOT <- "/mnt/d/OneDrive - Yayasan Transformasi Kesejahteraan Bangsa/1. Project Center/ARC8. Global Surgery Institute/8.1 Workforce"
DATA_DIR     <- file.path(PROJECT_ROOT, "data")
OUTPUT_DIR   <- file.path(PROJECT_ROOT, "output")
if (!dir.exists(OUTPUT_DIR)) dir.create(OUTPUT_DIR, recursive = TRUE)

This document builds the canonical workforce dataset for Indonesia surgical care. HC and SIP are carried as parallel columns from load to final output — never collapsed.

1 Load raw data

1.1 SIRS

Code
hosp_list <- read.csv(file.path(DATA_DIR, "hospitals_national.csv"))
bed       <- read.csv(file.path(DATA_DIR, "hospitals_beds.csv"))
services  <- read.csv(file.path(DATA_DIR, "hospitals_services.csv"))
staff     <- read.csv(file.path(DATA_DIR, "hospitals_staff.csv"))

# Hospitals that are in the SIRS hospital registry but submitted no bed/service/staff
# data. These are retained for registry completeness but flagged as ineligible for the
# SAO analysis because workforce counts cannot be computed.
sirs_errors_path <- file.path(DATA_DIR, "hospitals_errors.csv")
if (file.exists(sirs_errors_path)) {
  sirs_empty <- read.csv(sirs_errors_path, colClasses = "character") %>%
    mutate(hospital_id = as.integer(hospital_id)) %>%
    select(hospital_id, hospital_name_empty = hospital_name, sirs_error = error)
} else {
  sirs_empty <- tibble::tibble(hospital_id = integer(0),
                                hospital_name_empty = character(0),
                                sirs_error = character(0))
}

cat("SIRS rows — hospitals:", nrow(hosp_list),
    " | beds:", nrow(bed),
    " | services:", nrow(services),
    " | staff:", nrow(staff),
    " | empty (no SIRS data):", nrow(sirs_empty), "\n")
SIRS rows — hospitals: 3301  | beds: 57815  | services: 178747  | staff: 168204  | empty (no SIRS data): 13 

1.2 DREAMS Headcount

Code
hc_files <- list.files(file.path(DATA_DIR, "DREAMS Headcount"),
                       pattern = "\\.xlsx$", full.names = TRUE)

hc_all <- map_dfr(hc_files, function(path) {
  read_excel(path, skip = 1) %>%
    mutate(
      source_file = basename(path),
      source_stub = str_remove(basename(path), "\\.xlsx$")
    )
}) %>%
  filter(`Tipe Faskes` == "Rumah Sakit")

cat("DREAMS HC rows:", nrow(hc_all),
    " | files:", length(hc_files),
    " | distinct hospitals:", n_distinct(hc_all$`Nama Faskes`), "\n")
DREAMS HC rows: 9544  | files: 21  | distinct hospitals: 2523 

1.3 DREAMS SIP

Code
sip_files <- list.files(file.path(DATA_DIR, "DREAMS SIP"),
                        pattern = "\\.xlsx$", full.names = TRUE)

sip_all <- map_dfr(sip_files, function(path) {
  read_excel(path, skip = 1) %>%
    mutate(
      source_file = basename(path),
      source_stub = str_remove(basename(path), "\\.xlsx$")
    )
}) %>%
  filter(`Tipe Faskes` == "Rumah Sakit")

cat("DREAMS SIP rows:", nrow(sip_all),
    " | files:", length(sip_files),
    " | distinct hospitals:", n_distinct(sip_all$`Nama Faskes`), "\n")
DREAMS SIP rows: 18622  | files: 21  | distinct hospitals: 3084 

1.4 BPJS population denominator

Loaded live from bpjs_data.reguler.member via Postgres. Weighted by PSTV15 (bobot), filtered to member_status IN ('Active','Inactive') (excludes deceased), member_year = '2024'. Kab codes reconciled against the BPS shapefile.

35 BPJS kab codes use pre-2022 legacy values (mostly Papua splits) that don’t map to current BPS names without an authoritative Kemendagri lookup. Those ~7M pop are documented as gaps rather than guessed.

Code
suppressPackageStartupMessages({
  library(DBI); library(RPostgres); library(sf)
})

con <- dbConnect(
  Postgres(),
  host     = "localhost",
  port     = 5432,
  dbname   = "bpjs_data",
  user     = "julianbs",
  password = "123123"
)

denom_raw <- dbGetQuery(con, "
  SELECT
    \"PSTV10\"::int AS kab_code,
    COUNT(*)        AS n_sample,
    SUM(\"PSTV15\")  AS pop
  FROM reguler.member
  WHERE member_year   = '2024'
    AND member_status IN ('Active','Inactive')
    AND \"PSTV10\" IS NOT NULL
  GROUP BY 1;
")

cat("BPJS kab rows:", nrow(denom_raw),
    " | weighted pop (M):", round(sum(denom_raw$pop) / 1e6, 1), "\n")
BPJS kab rows: 515  | weighted pop (M): 291.6 

2 Prepare SIRS

2.1 Bed aggregation

Bellwether bed classes: Kelas I, Kelas II, Kelas III, KRIS JKN.

Code
bed_clean <- bed %>%
  mutate(count_num = suppressWarnings(as.integer(count))) %>%
  filter(!is.na(count_num))

hosp_bed_agg <- bed_clean %>%
  mutate(
    count    = as.integer(count),
    bed_flag = str_detect(
      bed_class,
      regex("kelas\\s*i\\b|kelas\\s*ii\\b|kelas\\s*iii\\b|kris\\s*jkn", ignore_case = TRUE)
    )
  ) %>%
  filter(bed_flag) %>%
  group_by(hospital_id) %>%
  summarise(
    bed_bellwether = 1L,
    bed_match_n    = n(),
    bed_total      = sum(count, na.rm = TRUE),
    .groups = "drop"
  )

2.2 Staff classification

Code
rx <- list(
  anesthesia_group   = regex("anestesi|anastesi|anesthesi|terapi intensif|intensive care|critical care", ignore_case = TRUE),
  anesthesia_type    = regex("sp\\.an(\\)|\\b)|anestesi|anastesi|anesthesi|intensive care|critical care|terapi intensif", ignore_case = TRUE),
  anesthesia_exclude = regex("andrologi|sp\\.and|penata anestesi", ignore_case = TRUE),

  obgyn_group = regex("obstetri|ginekologi|obgin|obgyn", ignore_case = TRUE),
  obgyn_type  = regex("sp\\.og|obstetri\\s*&\\s*ginekologi|obstetri\\s+dan\\s+ginekologi|ginekologi", ignore_case = TRUE),

  ortho_group   = regex("orthopaedi|orthopedi|orthopedic|orthopaedic|traumatologi", ignore_case = TRUE),
  ortho_type    = regex("sp\\.ot|orthopaedi|orthopedi|orthopedic|orthopaedic|traumatologi", ignore_case = TRUE),
  ortho_exclude = regex("orthodont|kawat gigi|gigi", ignore_case = TRUE),

  bedum_group   = regex("^subspesialis\\s+bedah|subspesialis\\s+bedah\\s+dan/atau|subspesialis bedah", ignore_case = TRUE),
  bedum_type    = regex("sp\\.b(\\)|\\b)|dokter spesialis bedah", ignore_case = TRUE),
  bedum_exclude = regex("bedah mulut|maksilofasial|bedah anak|vaskular|plastik|syaraf|saraf|thoraks|kardiovask|jantung|onkologi|digestif", ignore_case = TRUE)
)

hosp_staff_agg <- staff %>%
  filter(count > 0) %>%
  mutate(
    specialty_family = case_when(
      (str_detect(staff_group, rx$anesthesia_group) | str_detect(staff_type, rx$anesthesia_type)) &
        !str_detect(staff_group, rx$anesthesia_exclude) &
        !str_detect(staff_type,  rx$anesthesia_exclude) ~ "anesthesia",
      str_detect(staff_group, rx$obgyn_group) | str_detect(staff_type, rx$obgyn_type) ~ "obgyn",
      (str_detect(staff_group, rx$ortho_group) | str_detect(staff_type, rx$ortho_type)) &
        !str_detect(staff_group, rx$ortho_exclude) &
        !str_detect(staff_type,  rx$ortho_exclude) ~ "ortho",
      (str_detect(staff_group, rx$bedum_group) | str_detect(staff_type, rx$bedum_type)) &
        !str_detect(staff_group, rx$bedum_exclude) &
        !str_detect(staff_type,  rx$bedum_exclude) ~ "bedum",
      TRUE ~ NA_character_
    ),
    specialty_level = case_when(
      is.na(specialty_family) ~ NA_character_,
      str_detect(staff_group, regex("subspesialis", ignore_case = TRUE)) ~ "subspecialist",
      TRUE ~ "specialist"
    )
  ) %>%
  filter(!is.na(specialty_family)) %>%
  group_by(hospital_id, specialty_family, specialty_level) %>%
  summarise(count = sum(count, na.rm = TRUE), .groups = "drop") %>%
  unite("staff_col", specialty_family, specialty_level) %>%
  pivot_wider(names_from = staff_col, values_from = count, values_fill = 0L)

2.3 Combine SIRS into hosp_sirs

Code
hosp_sirs <- hosp_list %>%
  left_join(hosp_staff_agg, by = "hospital_id") %>%
  left_join(hosp_bed_agg,   by = "hospital_id") %>%
  mutate(across(
    any_of(c("anesthesia_specialist", "anesthesia_subspecialist",
             "bedum_specialist",      "bedum_subspecialist",
             "obgyn_specialist",      "obgyn_subspecialist",
             "ortho_specialist",      "ortho_subspecialist")),
    ~ replace_na(., 0L)
  )) %>%
  mutate(
    bed_bellwether = replace_na(bed_bellwether, 0L),
    bed_match_n    = replace_na(bed_match_n,    0L),
    bed_total      = replace_na(bed_total,      0L)
  ) %>%
  rename(
    sirs_anes_spec_n     = anesthesia_specialist,
    sirs_anes_subspec_n  = anesthesia_subspecialist,
    sirs_surg_spec_n     = bedum_specialist,
    sirs_surg_subspec_n  = bedum_subspecialist,
    sirs_obgyn_spec_n    = obgyn_specialist,
    sirs_obgyn_subspec_n = obgyn_subspecialist,
    sirs_ortho_spec_n    = ortho_specialist,
    sirs_ortho_subspec_n = ortho_subspecialist
  ) %>%
  mutate(
    sirs_anes_n  = sirs_anes_spec_n  + sirs_anes_subspec_n,
    sirs_surg_n  = sirs_surg_spec_n  + sirs_surg_subspec_n,
    sirs_obgyn_n = sirs_obgyn_spec_n + sirs_obgyn_subspec_n,
    sirs_ortho_n = sirs_ortho_spec_n + sirs_ortho_subspec_n,
    bed_present  = factor(if_else(bed_bellwether > 0L, "Yes", "No"), levels = c("Yes","No"))
  )

# Tag hospitals that have no SIRS data submitted (scraper found empty tables)
hosp_sirs <- hosp_sirs %>%
  left_join(sirs_empty, by = "hospital_id") %>%
  mutate(
    sirs_empty = !is.na(sirs_error),
    sirs_error = replace_na(sirs_error, NA_character_)
  ) %>%
  select(-hospital_name_empty)

cat("hosp_sirs rows:", nrow(hosp_sirs), "\n")
hosp_sirs rows: 3301 

3 Prepare DREAMS

3.1 SDMK → four-bucket lookup

Code
sdmk_lookup <- function(df) {
  df %>%
    mutate(
      sdmk_group = case_when(
        SDMK %in% c(
          "Dokter Spesialis Anastesiologi (Sp.An)",
          "Dokter Spesialis Anastesi - Konsultan Neuro Anastesi (AN-KNA)",
          "Subspesialis Terapi Intensif/Intensive Care",
          "Subspesialis Intensive Care",
          "Subspesialis Anestesi Kardiovaskular dan Critical Care",
          "Subspesialis Manajemen Nyeri",
          "Subspesialis Neuroanestesi dan Critical Care",
          "Subspesialis Anestesi Obstetri dan Critical Care",
          "Subspesialis Anestesi Pediatrik dan Critical Care",
          "Subspesialis Anestesi Regional"
        ) ~ "anes",

        SDMK %in% c(
          "Dokter Spesialis Bedah (Sp.B)",
          "Subspesialis Digestif",
          "Subspesialis Onkologi",
          "Subspesialis Bedah Kepala Leher"
        ) ~ "surg",

        SDMK %in% c(
          "Dokter Spesialis Obstetri & Ginekologi (Sp.OG)",
          "Subspesialis Feto Maternal",
          "Subspesialis Fertilitas dan endokrinologi reproduksi",
          "Subspesialis Obstetri ginekologi sosial",
          "Subspesialis Onkologi ginekologi",
          "Subspesialis Uroginekologi dan rekonstruksi"
        ) ~ "obgyn",

        SDMK %in% c(
          "Dokter Spesialis Orthopedi & Traumatologi (Sp.OT)",
          "Dokter Spesialis Bedah Orthopedi",
          "Subspesialis  Hip and Knee",
          "Subspesialis Orthopedic Spine",
          "Subspesialis Orthopaedic Sports Injury",
          "Subspesialis Hand, Upper Limb and Microsurgery",
          "Subspesialis Orthopedic Oncology",
          "Subspesialis Pediatric Orthopaedic",
          "Subspesialis Foot and Ankle",
          "Subspesialis Shoulder and Elbow",
          "Subspesialis  Advance Orthopaedic Trauma"
        ) ~ "ortho",

        TRUE ~ NA_character_
      )
    )
}

3.2 Specialty mapping table

Code
hc_mapping <- hc_all %>%
  sdmk_lookup() %>%
  group_by(bucket = sdmk_group, source = "dreams_hc",
           source_file_or_type = source_stub, sdmk_label = SDMK) %>%
  summarise(n_rows = n(), n_staff = sum(Jumlah, na.rm = TRUE), .groups = "drop")

sip_mapping <- sip_all %>%
  sdmk_lookup() %>%
  group_by(bucket = sdmk_group, source = "dreams_sip",
           source_file_or_type = source_stub, sdmk_label = SDMK) %>%
  summarise(n_rows = n(), n_staff = sum(Jumlah, na.rm = TRUE), .groups = "drop")

sirs_mapping <- staff %>%
  filter(count > 0) %>%
  mutate(
    bucket = case_when(
      (str_detect(staff_group, rx$anesthesia_group) | str_detect(staff_type, rx$anesthesia_type)) &
        !str_detect(staff_group, rx$anesthesia_exclude) &
        !str_detect(staff_type,  rx$anesthesia_exclude) ~ "anes",
      str_detect(staff_group, rx$obgyn_group) | str_detect(staff_type, rx$obgyn_type) ~ "obgyn",
      (str_detect(staff_group, rx$ortho_group) | str_detect(staff_type, rx$ortho_type)) &
        !str_detect(staff_group, rx$ortho_exclude) &
        !str_detect(staff_type,  rx$ortho_exclude) ~ "ortho",
      (str_detect(staff_group, rx$bedum_group) | str_detect(staff_type, rx$bedum_type)) &
        !str_detect(staff_group, rx$bedum_exclude) &
        !str_detect(staff_type,  rx$bedum_exclude) ~ "surg",
      TRUE ~ NA_character_
    ),
    level = if_else(str_detect(staff_group, regex("subspesialis", ignore_case = TRUE)),
                    "subspecialist", "specialist")
  ) %>%
  filter(!is.na(bucket)) %>%
  group_by(bucket, source = "sirs", level,
           source_file_or_type = staff_type, sdmk_label = staff_type) %>%
  summarise(n_rows = n(), n_staff = sum(count, na.rm = TRUE), .groups = "drop")

specialty_mapping <- bind_rows(
  hc_mapping  %>% mutate(level = NA_character_),
  sip_mapping %>% mutate(level = NA_character_),
  sirs_mapping
) %>%
  select(bucket, source, level, source_file_or_type, sdmk_label, n_rows, n_staff) %>%
  arrange(bucket, source, desc(n_staff))

cat("specialty_mapping rows:", nrow(specialty_mapping), "\n")
specialty_mapping rows: 70 
Code
specialty_mapping %>% count(bucket, source) %>% pivot_wider(names_from = source, values_from = n) %>% print()
# A tibble: 4 × 4
  bucket dreams_hc dreams_sip  sirs
  <chr>      <int>      <int> <int>
1 anes           2          2     9
2 obgyn          6          6     6
3 ortho         10         10    11
4 surg           3          3     2

3.3 Aggregate HC and SIP per hospital × specialty

Code
hc_agg <- hc_all %>%
  sdmk_lookup() %>%
  filter(!is.na(sdmk_group)) %>%
  group_by(Propinsi, Kabupaten, `Nama Faskes`, `Tipe Faskes`, sdmk_group) %>%
  summarise(hc_n = sum(Jumlah, na.rm = TRUE), .groups = "drop")

sip_agg <- sip_all %>%
  sdmk_lookup() %>%
  filter(!is.na(sdmk_group)) %>%
  group_by(Propinsi, Kabupaten, `Nama Faskes`, `Tipe Faskes`, sdmk_group) %>%
  summarise(sip_n = sum(Jumlah, na.rm = TRUE), .groups = "drop")

cat("HC rows:", nrow(hc_agg), " | SIP rows:", nrow(sip_agg), "\n")
HC rows: 6243  | SIP rows: 10378 

3.4 Combine into dreams_wide

Code
dreams_long <- full_join(
  hc_agg, sip_agg,
  by = c("Propinsi", "Kabupaten", "Nama Faskes", "Tipe Faskes", "sdmk_group")
) %>%
  mutate(hc_n = coalesce(hc_n, 0L), sip_n = coalesce(sip_n, 0L))

dreams_wide <- dreams_long %>%
  select(Propinsi, Kabupaten, `Nama Faskes`, `Tipe Faskes`,
         sdmk_group, hc_n, sip_n) %>%
  pivot_wider(
    names_from  = sdmk_group,
    values_from = c(hc_n, sip_n),
    values_fill = list(hc_n = 0L, sip_n = 0L)
  ) %>%
  rename(
    hc_anes_n   = hc_n_anes,
    hc_surg_n   = hc_n_surg,
    hc_obgyn_n  = hc_n_obgyn,
    hc_ortho_n  = hc_n_ortho,
    sip_anes_n  = sip_n_anes,
    sip_surg_n  = sip_n_surg,
    sip_obgyn_n = sip_n_obgyn,
    sip_ortho_n = sip_n_ortho
  )

cat("DREAMS hospitals (HC ∪ SIP):", nrow(dreams_wide), "\n")
DREAMS hospitals (HC ∪ SIP): 3149 

4 Match DREAMS → SIRS

4.1 Normalisation functions

Code
# --- Generic name cleaning ---------------------------------------------------
norm_region <- function(x) {
  x |> as.character() |> str_to_upper() |>
    str_replace_all("[^A-Z0-9]", " ") |>
    str_replace_all("\\s+", " ") |> str_trim()
}

# --- Facility name normalization ---------------------------------------------
norm_faskes <- function(x) {
  x |>
    str_to_upper() |>
    str_replace_all("[^A-Z0-9 ]", " ") |>
    str_squish() |>
    str_replace("^RUMAH SAKIT UMUM DAERAH ", "RSUD ") |>
    str_replace("^RS UMUM DAERAH ",          "RSUD ") |>
    str_replace("^RUMAH SAKIT UMUM PUSAT ",  "RSUP ") |>
    str_replace("^RS UMUM PUSAT ",           "RSUP ") |>
    str_replace("^RUMAH SAKIT UMUM ",        "RSU ")  |>
    str_replace("^RUMAH SAKIT ",             "RS ")   |>
    str_replace("^RS U ",                    "RSU ")  |>
    str_replace_all("\\bDRS\\b",      " ") |>
    str_replace_all("\\bDR\\b",       " ") |>
    str_replace_all("\\bH\\b",        " ") |>
    str_replace_all("\\bHJ\\b",       " ") |>
    str_replace_all("\\bHJH\\b",      " ") |>
    str_replace_all("\\bPROF\\b",     " ") |>
    str_replace_all("\\bPROFESOR\\b", " ") |>
    str_replace_all("\\bIR\\b",       " ") |>
    str_squish()
}

# --- Province normalization --------------------------------------------------
fix_province <- function(x) {
  x <- str_to_upper(str_squish(x))
  x[x == "JAKARTA"]                    <- "DKI JAKARTA"
  x[x == "BANGKA BELITUNG"]            <- "KEPULAUAN BANGKA BELITUNG"
  x[x == "KEP BANGKA BELITUNG"]        <- "KEPULAUAN BANGKA BELITUNG"
  x[x == "KEP. BANGKA BELITUNG"]       <- "KEPULAUAN BANGKA BELITUNG"
  x[x == "YOGYAKARTA"]                 <- "DI YOGYAKARTA"
  x[x == "D.I. YOGYAKARTA"]            <- "DI YOGYAKARTA"
  x[x == "DAERAH ISTIMEWA YOGYAKARTA"] <- "DI YOGYAKARTA"
  # Collapse post-2022 Papua splits back to pre-split for SIRS/DREAMS compat
  x[x == "PAPUA BARAT DAYA"]           <- "PAPUA BARAT"
  x[x == "PAPUA SELATAN"]              <- "PAPUA"
  x[x == "PAPUA TENGAH"]               <- "PAPUA"
  x[x == "PAPUA PEGUNUNGAN"]           <- "PAPUA"
  x
}

# --- Kabupaten alias table ---------------------------------------------------
# Canonical form = AUTHORITATIVE Kemendagri Permendagri 137/2017 name.
# Variants come from SIRS and DREAMS free-text regency strings and must map
# to the exact `kab_name` that appears in auth_kab_lookup.rds.
#
# Only entries where SIRS/DREAMS differ from auth belong here. If a source
# string already normalizes to the auth name, NO alias is needed.
kab_aliases <- tibble::tribble(
  ~variant,                              ~canonical,
  # DKI: SIRS uses 'Kota Jakarta X', auth is 'KOTA JAKARTA X' — already matches.
  # No alias needed for DKI.

  # Name variants where SIRS/DREAMS differ from auth form:
  "KOTA PADANG SIDEMPUAN",               "KOTA PADANGSIDIMPUAN",
  "KOTA DUMAI",                          "KOTA D U M A I",
  "SIAK",                                "S I A K",
  "KOTA BATAM",                          "KOTA B A T A M",
  "BANYUASIN",                           "BANYU ASIN",
  "MUKO MUKO",                           "MUKOMUKO",
  "TULANG BAWANG",                       "TULANGBAWANG",
  "KARANGASEM",                          "KARANG ASEM",
  "KEPULAUAN SIAU TAGULANDANG BIARO",    "SIAU TAGULANDANG BIARO",
  "KEP SIAU TAGULANDANG BIARO",          "SIAU TAGULANDANG BIARO",
  "PANGKAJENE KEPULAUAN",                "PANGKAJENE DAN KEPULAUAN",
  "KOTA PARE PARE",                      "KOTA PAREPARE",
  "KOTA BAU BAU",                        "KOTA BAUBAU",
  "PASANGKAYU",                          "MAMUJU UTARA",
  "KEPULAUAN TANIMBAR",                  "MALUKU TENGGARA BARAT",

  # Hyphen vs space variants
  "TOLI TOLI",                           "TOLI-TOLI",
  "TOJO UNA UNA",                        "TOJO UNA-UNA",

  # Other common legacy variants from Sumut Lubuklinggau which isn't a
  # name issue but a frequent typo we've seen
  "KOTA LUBUK LINGGAU",                  "KOTA LUBUKLINGGAU"
)

# --- Canonicalize any kab name via the alias table ---------------------------
canonicalize_kab <- function(x, alias_tbl = kab_aliases) {
  x_norm <- norm_region(x)
  idx <- match(x_norm, norm_region(alias_tbl$variant))
  ifelse(is.na(idx), x_norm, norm_region(alias_tbl$canonical[idx]))
}

fix_kab <- function(x) canonicalize_kab(x)

4.2 4-way crosswalk: SIRS × DREAMS × SHP × BPJS

Authoritative kab names from Kemendagri Permendagri 137/2017 (via auth_kab_lookup.rds). 514 kabs, one row each, keyed on kab_name (canonical). Every data source joins via name (not code), because the BPS shapefile uses pre-revision codes for several kabs (notably Kaltim’s 2012 Kaltara split) which would otherwise cause silent population misassignment.

Code
# Load authoritative kab lookup (514 rows: bps_code, kab_name, prov_name, ...)
auth_kab <- readRDS(file.path(OUTPUT_DIR, "auth_kab_lookup.rds"))
auth_to_shp_alias <- readRDS(file.path(OUTPUT_DIR, "auth_to_shp_alias.rds"))

# Attach shapefile kab_code to each authoritative row via name (direct or alias).
suppressPackageStartupMessages(library(sf))
SHP_DIR  <- "/mnt/d/OneDrive - Yayasan Transformasi Kesejahteraan Bangsa/1. Project Center/File SHP"
shp_raw  <- st_read(file.path(SHP_DIR, "kab_kota", "kab.shp"), quiet = TRUE) |> st_drop_geometry()
shp_lookup <- shp_raw |>
  transmute(
    shp_kab_code  = as.integer(KODE_KAB_S),
    shp_prov_code = as.integer(KODE_PROP_),
    shp_kab_name  = toupper(as.character(NAMA_KAB)),
    shp_prov_name = toupper(as.character(NAMA_PROP))
  )

# Canonical base: every row keyed on kab_name (authoritative, unique)
canonical <- auth_kab |>
  left_join(auth_to_shp_alias |> select(auth_name, shp_name),
            by = c("kab_name" = "auth_name")) |>
  mutate(shp_name = coalesce(shp_name, kab_name)) |>
  left_join(shp_lookup, by = c("shp_name" = "shp_kab_name")) |>
  transmute(
    # Authoritative identity
    bps_code           = bps_code,
    kab_name_canonical = kab_name,
    # prov_name_canonical collapses Papua post-2022 splits back to pre-split names
    # for pipeline-wide consistency with fix_province().
    # prov_name_new preserves the post-2022 split names (for analyses that need them).
    prov_name_canonical= fix_province(prov_name),
    prov_name_new      = prov_name,
    prov_code_legacy   = prov_code_legacy,
    prov_name_legacy   = toupper(prov_name_legacy),
    # Shapefile linkage (for geometry joins downstream)
    shp_kab_code,
    shp_prov_code,
    shp_kab_name       = shp_name,
    shp_prov_name,
    # Normalized key for matching hospital/DREAMS strings
    key_norm           = norm_region(kab_name)
  )

stopifnot(nrow(canonical) == 514, sum(is.na(canonical$shp_kab_code)) == 0)
cat("Canonical base rows:", nrow(canonical), "\n")
Canonical base rows: 514 
Code
# Match SIRS regency strings (already canonicalized via fix_kab/fix_province)
# to authoritative names by normalized form.
sirs_map <- hosp_list |>
  distinct(sirs_province_raw = province, sirs_regency_raw = regency) |>
  mutate(key_norm = canonicalize_kab(sirs_regency_raw)) |>
  left_join(canonical |> select(bps_code, key_norm_canon = key_norm),
            by = c("key_norm" = "key_norm_canon")) |>
  distinct(bps_code, sirs_province_raw, sirs_regency_raw) |>
  filter(!is.na(bps_code))

cat("SIRS matched to canonical:", nrow(sirs_map), "\n")
SIRS matched to canonical: 513 
Code
dreams_map <- bind_rows(
  hc_all  |> distinct(dreams_propinsi = Propinsi, dreams_kabupaten = Kabupaten),
  sip_all |> distinct(dreams_propinsi = Propinsi, dreams_kabupaten = Kabupaten)
) |>
  distinct() |>
  mutate(key_norm = canonicalize_kab(dreams_kabupaten)) |>
  left_join(canonical |> select(bps_code, key_norm_canon = key_norm),
            by = c("key_norm" = "key_norm_canon")) |>
  distinct(bps_code, dreams_propinsi, dreams_kabupaten) |>
  filter(!is.na(bps_code))

cat("DREAMS matched to canonical:", nrow(dreams_map), "\n")
DREAMS matched to canonical: 501 
Code
# BPJS kab codes (PSTV10) are identical to authoritative Permendagri codes (verified 514/514).
# So the BPJS→canonical join is a clean merge on bps_code. No code remapping needed.

if (!exists("con") || !DBI::dbIsValid(con)) {
  con <- dbConnect(
    Postgres(), host = "localhost", port = 5432,
    dbname = "bpjs_data", user = "julianbs", password = "123123"
  )
}

bpjs_raw_codes <- dbGetQuery(con, "
  SELECT DISTINCT
    \"PSTV09\"::int     AS pstv09,
    \"PSTV09_NEW\"::int AS pstv09_new,
    \"PSTV10\"::int     AS pstv10
  FROM reguler.member
  WHERE \"PSTV10\" IS NOT NULL AND \"PSTV10\" != 9999
    AND member_year = '2024';
")

bpjs_map <- bpjs_raw_codes |>
  inner_join(canonical |> select(bps_code), by = c("pstv10" = "bps_code")) |>
  rename(bpjs_pstv09 = pstv09, bpjs_pstv09_new = pstv09_new, bpjs_pstv10 = pstv10)

cat("BPJS matched to canonical:", nrow(bpjs_map), "/", nrow(bpjs_raw_codes), "\n")
BPJS matched to canonical: 514 / 514 
Code
kab_crosswalk <- canonical |>
  left_join(
    sirs_map |> group_by(bps_code) |>
      summarise(
        sirs_regency_raw  = paste(unique(sirs_regency_raw),  collapse = " | "),
        sirs_province_raw = paste(unique(sirs_province_raw), collapse = " | "),
        .groups = "drop"),
    by = "bps_code") |>
  left_join(
    dreams_map |> group_by(bps_code) |>
      summarise(
        dreams_kabupaten_raw = paste(unique(dreams_kabupaten), collapse = " | "),
        dreams_propinsi_raw  = paste(unique(dreams_propinsi),  collapse = " | "),
        .groups = "drop"),
    by = "bps_code") |>
  left_join(
    bpjs_map |> rename(bps_code = bpjs_pstv10) |>
      group_by(bps_code) |>
      summarise(bpjs_pstv09 = first(bpjs_pstv09),
                bpjs_pstv09_new = first(bpjs_pstv09_new),
                .groups = "drop") |>
      mutate(bpjs_pstv10 = bps_code),
    by = "bps_code")

cat("Crosswalk rows:", nrow(kab_crosswalk), "\n")
Crosswalk rows: 514 
Code
cat("Match coverage:\n")
Match coverage:
Code
tibble(
  source  = c("SIRS", "DREAMS", "BPJS"),
  matched = c(sum(!is.na(kab_crosswalk$sirs_regency_raw)),
              sum(!is.na(kab_crosswalk$dreams_kabupaten_raw)),
              sum(!is.na(kab_crosswalk$bpjs_pstv10)))
) |> print()
# A tibble: 3 × 2
  source matched
  <chr>    <int>
1 SIRS       513
2 DREAMS     500
3 BPJS       514
Code
saveRDS(kab_crosswalk, file.path(OUTPUT_DIR, "kab_crosswalk.rds"))
cat("\nSaved:", file.path(OUTPUT_DIR, "kab_crosswalk.rds"), "\n")

Saved: /mnt/d/OneDrive - Yayasan Transformasi Kesejahteraan Bangsa/1. Project Center/ARC8. Global Surgery Institute/8.1 Workforce/output/kab_crosswalk.rds 

4.3 BPJS denominator — join via authoritative BPS code

Code
denom_clean <- denom_raw |>
  filter(kab_code != 9999) |>
  inner_join(kab_crosswalk |> select(bps_code, kab_name_canonical, prov_name_canonical),
             by = c("kab_code" = "bps_code")) |>
  transmute(
    province_key = prov_name_canonical,
    regency_key  = kab_name_canonical,
    pop
  ) |>
  group_by(province_key, regency_key) |>
  summarise(pop = sum(pop, na.rm = TRUE), .groups = "drop")

cat("Denominator rows:", nrow(denom_clean),
    " | total pop (M):", round(sum(denom_clean$pop) / 1e6, 1), "\n")
Denominator rows: 514  | total pop (M): 284.5 

4.4 Build matching keys

Code
dreams_keys <- dreams_wide %>%
  filter(`Tipe Faskes` == "Rumah Sakit") %>%
  mutate(
    province_key = fix_province(Propinsi),
    regency_key  = fix_kab(Kabupaten),
    facility_key = norm_faskes(`Nama Faskes`)
  )

sirs_keys <- hosp_sirs %>%
  mutate(
    province_key = fix_province(province),
    regency_key  = fix_kab(regency),
    facility_key = norm_faskes(hospital_name)
  )

4.5 Exact match

Code
exact_matched <- dreams_keys %>%
  inner_join(
    sirs_keys %>% select(hospital_id, hospital_name, province_key, regency_key, facility_key),
    by = c("province_key", "regency_key", "facility_key")
  ) %>%
  mutate(match_type = "exact")

dreams_unmatched <- dreams_keys %>%
  anti_join(sirs_keys, by = c("province_key", "regency_key", "facility_key"))

sirs_unmatched <- sirs_keys %>%
  anti_join(dreams_keys, by = c("province_key", "regency_key", "facility_key"))

cat("Exact:", nrow(exact_matched),
    " | DREAMS unmatched:", nrow(dreams_unmatched),
    " | SIRS unmatched:", nrow(sirs_unmatched), "\n")
Exact: 3090  | DREAMS unmatched: 60  | SIRS unmatched: 212 

4.6 Fuzzy match (within province + regency)

Code
fuzzy_candidates <- dreams_unmatched %>%
  inner_join(
    sirs_unmatched,
    by     = c("province_key", "regency_key"),
    suffix = c("_dreams", "_sirs"),
    relationship = "many-to-many"
  ) %>%
  mutate(
    dist_lv = stringdist(facility_key_dreams, facility_key_sirs, method = "lv"),
    dist_jw = stringdist(facility_key_dreams, facility_key_sirs, method = "jw")
  ) %>%
  group_by(province_key, regency_key, facility_key_dreams) %>%
  arrange(dist_lv, dist_jw, .by_group = TRUE) %>%
  slice(1) %>% ungroup()

fuzzy_accepted <- fuzzy_candidates %>%
  filter(dist_lv <= 5 & dist_jw <= 0.22) %>%
  mutate(match_type = "fuzzy")

cat("Fuzzy accepted:", nrow(fuzzy_accepted), "\n")
Fuzzy accepted: 5 

4.7 Manual overrides + SIRS placeholders

Code
manual_name_fixes <- tribble(
  ~facility_key_dreams,                         ~province_key,          ~regency_key,             ~hospital_id, ~override_reason,
  "RS INFEKSI WANGSA AVATARA",                  "DKI JAKARTA",          "KOTA JAKARTA SELATAN",   3171918L, "COVID 'INFEKSI' prefix added in DREAMS; SIRS: RS Wangsa Avatara",
  "RSU CAHAYA HUSADA",                          "ACEH",                 "NAGAN RAYA",             1115023L, "SIRS legacy name includes 'SWASTA'; dropped after reclassification",
  "RS TK II 17 05 01 MARTHEN INDEY JAYAPURA",   "PAPUA",                "KOTA JAYAPURA",          9203016L, "DREAMS uses military code Tk.II; SIRS: RS TNI AD Marthen Indey",
  "RS SADEWA",                                  "DI YOGYAKARTA",        "SLEMAN",                 3404187L, "DREAMS short name; SIRS: RS Khusus Ibu Anak Sadewa",
  "RSUD KABUPATEN ROTE NDAO",                   "NUSA TENGGARA TIMUR",  "ROTE NDAO",              5303024L, "SIRS appends city suffix 'Ba'a'; DREAMS uses 'Kabupaten'"
)

sirs_placeholders <- tribble(
  ~facility_key_dreams,   ~province_key,      ~regency_key,      ~hospital_id, ~hospital_name,                 ~placeholder_note,
  "RS GRIYA WALUYA",      "JAWA TIMUR",       "PONOROGO",        3502145L, "RS Griya Waluya",              "Confirmed at sirs.kemkes.go.id",
  "RS HAJI DARJAD",       "KALIMANTAN TIMUR", "KOTA SAMARINDA",  6472107L, "RS Haji Darjad",               "Confirmed at sirs.kemkes.go.id",
  "RS UMUM MOJOSONGO 2",  "JAWA TENGAH",      "KARANGANYAR",     3313056L, "RS Khusus Bedah Mojosongo 2",  "DREAMS uses 'Umum', SIRS uses 'Khusus Bedah'"
)

manual_map <- manual_name_fixes %>%
  left_join(sirs_keys %>% select(hospital_id, hospital_name), by = "hospital_id") %>%
  transmute(
    province_key, regency_key, facility_key_dreams,
    matched_hospital_id   = hospital_id,
    matched_hospital_name = hospital_name,
    match_type = "manual_override"
  )

placeholder_map <- sirs_placeholders %>%
  transmute(
    province_key, regency_key, facility_key_dreams,
    matched_hospital_id   = hospital_id,
    matched_hospital_name = hospital_name,
    match_type = "placeholder_sirs_id"
  )

4.8 Combined match map

Code
exact_map <- exact_matched %>%
  transmute(
    province_key, regency_key,
    facility_key_dreams   = facility_key,
    matched_hospital_id   = hospital_id,
    matched_hospital_name = hospital_name,
    match_type
  )

fuzzy_map <- fuzzy_accepted %>%
  transmute(
    province_key, regency_key, facility_key_dreams,
    matched_hospital_id   = hospital_id,
    matched_hospital_name = hospital_name,
    match_type
  )

match_priority <- c(exact = 1L, fuzzy = 2L, manual_override = 3L, placeholder_sirs_id = 4L)

combined_match_map <- bind_rows(exact_map, fuzzy_map, manual_map, placeholder_map) %>%
  mutate(match_rank = match_priority[match_type]) %>%
  arrange(province_key, regency_key, facility_key_dreams, match_rank) %>%
  group_by(province_key, regency_key, facility_key_dreams) %>%
  slice(1) %>% ungroup() %>%
  select(-match_rank)

cat("Match map entries:\n")
Match map entries:
Code
combined_match_map %>% count(match_type) %>% print()
# A tibble: 4 × 2
  match_type              n
  <chr>               <int>
1 exact                3088
2 fuzzy                   5
3 manual_override         3
4 placeholder_sirs_id     3

5 Build hospital-level dataset

5.1 Attach match results, join SIRS attributes

Code
dreams_with_match <- dreams_keys %>%
  left_join(
    combined_match_map,
    by = c("province_key", "regency_key", "facility_key" = "facility_key_dreams")
  ) %>%
  mutate(match_type = replace_na(match_type, "dreams_unmatched"))

dreams_joined <- dreams_with_match %>%
  left_join(
    hosp_sirs %>%
      select(
        hospital_id, ownership, class,
        sirs_anes_spec_n, sirs_anes_subspec_n, sirs_anes_n,
        sirs_surg_spec_n, sirs_surg_subspec_n, sirs_surg_n,
        sirs_obgyn_spec_n, sirs_obgyn_subspec_n, sirs_obgyn_n,
        sirs_ortho_spec_n, sirs_ortho_subspec_n, sirs_ortho_n,
        bed_bellwether, bed_match_n, bed_total, bed_present
      ),
    by = c("matched_hospital_id" = "hospital_id")
  )

cat("dreams_joined rows:", nrow(dreams_joined), "\n")
dreams_joined rows: 3149 

5.2 SIRS-only hospitals

Code
sirs_only_ids <- sirs_keys %>%
  anti_join(combined_match_map, by = c("hospital_id" = "matched_hospital_id")) %>%
  pull(hospital_id)

sirs_only_rows <- hosp_sirs %>%
  filter(hospital_id %in% sirs_only_ids) %>%
  transmute(
    matched_hospital_id   = hospital_id,
    matched_hospital_name = hospital_name,
    match_type            = "sirs_only",
    Propinsi  = NA_character_, Kabupaten = NA_character_,
    `Nama Faskes` = NA_character_, `Tipe Faskes` = NA_character_,
    province_key = fix_province(province),
    regency_key  = fix_kab(regency),
    facility_key = norm_faskes(hospital_name),
    hc_anes_n  = NA_real_, hc_surg_n  = NA_real_, hc_obgyn_n = NA_real_, hc_ortho_n = NA_real_,
    sip_anes_n = NA_real_, sip_surg_n = NA_real_, sip_obgyn_n = NA_real_, sip_ortho_n = NA_real_,
    ownership, class,
    sirs_anes_spec_n, sirs_anes_subspec_n, sirs_anes_n,
    sirs_surg_spec_n, sirs_surg_subspec_n, sirs_surg_n,
    sirs_obgyn_spec_n, sirs_obgyn_subspec_n, sirs_obgyn_n,
    sirs_ortho_spec_n, sirs_ortho_subspec_n, sirs_ortho_n,
    bed_bellwether, bed_match_n, bed_total, bed_present
  )

cat("SIRS-only hospitals:", nrow(sirs_only_rows), "\n")
SIRS-only hospitals: 205 

5.3 Canonical counts (HC and SIP parallel)

Code
resolve_counts <- function(df) {
  df %>%
    mutate(
      anes_hc_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_anes_n),
        !is.na(hc_anes_n)          ~ as.numeric(hc_anes_n),
        TRUE                        ~ 0),
      anes_hc_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_hc"),
      anes_sip_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_anes_n),
        !is.na(sip_anes_n)         ~ as.numeric(sip_anes_n),
        TRUE                        ~ 0),
      anes_sip_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_sip"),

      surg_hc_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_surg_n),
        !is.na(hc_surg_n)          ~ as.numeric(hc_surg_n),
        TRUE                        ~ 0),
      surg_hc_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_hc"),
      surg_sip_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_surg_n),
        !is.na(sip_surg_n)         ~ as.numeric(sip_surg_n),
        TRUE                        ~ 0),
      surg_sip_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_sip"),

      obgyn_hc_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_obgyn_n),
        !is.na(hc_obgyn_n)         ~ as.numeric(hc_obgyn_n),
        TRUE                        ~ 0),
      obgyn_hc_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_hc"),
      obgyn_sip_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_obgyn_n),
        !is.na(sip_obgyn_n)        ~ as.numeric(sip_obgyn_n),
        TRUE                        ~ 0),
      obgyn_sip_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_sip"),

      ortho_hc_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_ortho_n),
        !is.na(hc_ortho_n)         ~ as.numeric(hc_ortho_n),
        TRUE                        ~ 0),
      ortho_hc_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_hc"),
      ortho_sip_n = case_when(
        match_type == "sirs_only" ~ as.numeric(sirs_ortho_n),
        !is.na(sip_ortho_n)        ~ as.numeric(sip_ortho_n),
        TRUE                        ~ 0),
      ortho_sip_src = if_else(match_type == "sirs_only", "sirs_fallback", "dreams_sip")
    )
}

hosp_all <- bind_rows(
  dreams_joined  %>% resolve_counts(),
  sirs_only_rows %>% resolve_counts()
)

cat("hosp_all rows:", nrow(hosp_all), "\n")
hosp_all rows: 3354 
Code
hosp_all %>% count(match_type) %>% print()
# A tibble: 6 × 2
  match_type              n
  <chr>               <int>
1 dreams_unmatched       50
2 exact                3089
3 fuzzy                   5
4 manual_override         3
5 placeholder_sirs_id     2
6 sirs_only             205

5.4 Teams and BW combo labels

Code
# S = Surgery (gen surg + ortho, combined), A = Anesthesia, O = Obgyn
#
# Categorization (5 booleans per hospital):
#   sao_by_hc   — SAO by HC, all three >= 1
#   sao_by_sip  — SAO by SIP, all three >= 1
#   so_by_hc    — S + O by HC, anesthesia substituted by penata/perawat anestesi
#   sao_min2    — SAO by SIP, each >= 2 (minimum team strength per specialty)
#   so_min2     — S + O by SIP, each >= 2
#
# Plus a "has specialist but no bed" diagnostic flag (reportable edge case).

hosp_workforce <- hosp_all %>%
  mutate(
    # Combined Surgery = gen surg + ortho (both HC and SIP)
    surg_comb_hc_n  = coalesce(surg_hc_n, 0)  + coalesce(ortho_hc_n, 0),
    surg_comb_sip_n = coalesce(surg_sip_n, 0) + coalesce(ortho_sip_n, 0),

    # Specialist-pair team min counts (kept for compatibility with density aggregates)
    team_gensurg_hc_n  = pmin(coalesce(anes_hc_n,  0), coalesce(surg_hc_n,  0)),
    team_obg_hc_n      = pmin(coalesce(anes_hc_n,  0), coalesce(obgyn_hc_n, 0)),
    team_ortho_hc_n    = pmin(coalesce(anes_hc_n,  0), coalesce(ortho_hc_n, 0)),
    team_gensurg_sip_n = pmin(coalesce(anes_sip_n, 0), coalesce(surg_sip_n,  0)),
    team_obg_sip_n     = pmin(coalesce(anes_sip_n, 0), coalesce(obgyn_sip_n, 0)),
    team_ortho_sip_n   = pmin(coalesce(anes_sip_n, 0), coalesce(ortho_sip_n, 0)),

    # SAO classifications — booleans (threshold >=1 for 1a/1b/2, >=2 for 3/4)
    sao_by_hc  = coalesce(surg_comb_hc_n,  0) >= 1 &
                 coalesce(anes_hc_n,       0) >= 1 &
                 coalesce(obgyn_hc_n,      0) >= 1,
    sao_by_sip = coalesce(surg_comb_sip_n, 0) >= 1 &
                 coalesce(anes_sip_n,      0) >= 1 &
                 coalesce(obgyn_sip_n,     0) >= 1,
    so_by_hc   = coalesce(surg_comb_hc_n,  0) >= 1 &
                 coalesce(obgyn_hc_n,      0) >= 1,
    sao_min2   = coalesce(surg_comb_sip_n, 0) >= 2 &
                 coalesce(anes_sip_n,      0) >= 2 &
                 coalesce(obgyn_sip_n,     0) >= 2,
    so_min2    = coalesce(surg_comb_sip_n, 0) >= 2 &
                 coalesce(obgyn_sip_n,     0) >= 2,

    # Composite mutually-exclusive capability hierarchy — SIP view
    sao_tier_sip = dplyr::case_when(
      coalesce(surg_comb_sip_n, 0) >= 2 &
        coalesce(anes_sip_n,    0) >= 2 &
        coalesce(obgyn_sip_n,   0) >= 2 ~ "SAO ≥2",
      coalesce(surg_comb_sip_n, 0) >= 1 &
        coalesce(anes_sip_n,    0) >= 1 &
        coalesce(obgyn_sip_n,   0) >= 1 ~ "SAO ≥1",
      coalesce(surg_comb_sip_n, 0) >= 1 &
        coalesce(obgyn_sip_n,   0) >= 1 ~ "SO only",
      coalesce(surg_comb_sip_n, 0) >= 1 ~ "S only",
      coalesce(obgyn_sip_n,     0) >= 1 ~ "O only",
      TRUE                               ~ "None"
    ),

    # Composite mutually-exclusive capability hierarchy — HC view
    sao_tier_hc = dplyr::case_when(
      coalesce(surg_comb_hc_n, 0) >= 2 &
        coalesce(anes_hc_n,    0) >= 2 &
        coalesce(obgyn_hc_n,   0) >= 2 ~ "SAO ≥2",
      coalesce(surg_comb_hc_n, 0) >= 1 &
        coalesce(anes_hc_n,    0) >= 1 &
        coalesce(obgyn_hc_n,   0) >= 1 ~ "SAO ≥1",
      coalesce(surg_comb_hc_n, 0) >= 1 &
        coalesce(obgyn_hc_n,   0) >= 1 ~ "SO only",
      coalesce(surg_comb_hc_n, 0) >= 1 ~ "S only",
      coalesce(obgyn_hc_n,     0) >= 1 ~ "O only",
      TRUE                              ~ "None"
    ),

    # Edge case flag: any S/A/O specialist but zero beds
    any_sao_specialist = coalesce(surg_comb_hc_n, 0) > 0 |
                         coalesce(anes_hc_n,       0) > 0 |
                         coalesce(obgyn_hc_n,      0) > 0 |
                         coalesce(surg_comb_sip_n, 0) > 0 |
                         coalesce(anes_sip_n,      0) > 0 |
                         coalesce(obgyn_sip_n,     0) > 0,
    has_specialist_no_bed = any_sao_specialist & coalesce(bed_total, 0) == 0
  ) %>%
  relocate(
    matched_hospital_id, matched_hospital_name, match_type,
    province_key, regency_key, ownership, class,
    bed_present, bed_bellwether, bed_total,
    anes_hc_n, anes_sip_n,
    surg_hc_n, surg_sip_n, ortho_hc_n, ortho_sip_n,
    surg_comb_hc_n, surg_comb_sip_n,
    obgyn_hc_n, obgyn_sip_n,
    team_gensurg_hc_n, team_gensurg_sip_n,
    team_obg_hc_n,     team_obg_sip_n,
    team_ortho_hc_n,   team_ortho_sip_n,
    sao_by_hc, sao_by_sip, so_by_hc, sao_min2, so_min2,
    has_specialist_no_bed,
    .before = everything()
  )

cat("SAO classifications (hospital counts):\n")
SAO classifications (hospital counts):
Code
hosp_workforce %>%
  summarise(across(c(sao_by_hc, sao_by_sip, so_by_hc, sao_min2, so_min2),
                   ~ sum(.x, na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "category", values_to = "n_hospitals") %>%
  mutate(pct_of_total = round(100 * n_hospitals / nrow(hosp_workforce), 1)) %>%
  print()
# A tibble: 5 × 3
  category   n_hospitals pct_of_total
  <chr>            <int>        <dbl>
1 sao_by_hc         1183         35.3
2 sao_by_sip        2700         80.5
3 so_by_hc          1504         44.8
4 sao_min2          1602         47.8
5 so_min2           1894         56.5
Code
cat("\nHospitals with specialist but no bed (diagnostic flag):\n")

Hospitals with specialist but no bed (diagnostic flag):
Code
cat("  n =", sum(hosp_workforce$has_specialist_no_bed, na.rm = TRUE), "\n")
  n = 62 

5.5 Attach island grouping

Code
island_lookup <- tribble(
  ~province_key,                 ~island,
  "ACEH",                         "Sumatera",
  "SUMATERA UTARA",               "Sumatera",
  "SUMATERA BARAT",               "Sumatera",
  "RIAU",                         "Sumatera",
  "KEPULAUAN RIAU",               "Sumatera",
  "JAMBI",                        "Sumatera",
  "SUMATERA SELATAN",             "Sumatera",
  "KEPULAUAN BANGKA BELITUNG",    "Sumatera",
  "BENGKULU",                     "Sumatera",
  "LAMPUNG",                      "Sumatera",
  "DKI JAKARTA",                  "Jawa",
  "JAWA BARAT",                   "Jawa",
  "BANTEN",                       "Jawa",
  "JAWA TENGAH",                  "Jawa",
  "DI YOGYAKARTA",                "Jawa",
  "JAWA TIMUR",                   "Jawa",
  "BALI",                         "Bali & Nusa Tenggara",
  "NUSA TENGGARA BARAT",          "Bali & Nusa Tenggara",
  "NUSA TENGGARA TIMUR",          "Bali & Nusa Tenggara",
  "KALIMANTAN BARAT",             "Kalimantan",
  "KALIMANTAN TENGAH",            "Kalimantan",
  "KALIMANTAN SELATAN",           "Kalimantan",
  "KALIMANTAN TIMUR",             "Kalimantan",
  "KALIMANTAN UTARA",             "Kalimantan",
  "SULAWESI UTARA",               "Sulawesi",
  "SULAWESI TENGAH",              "Sulawesi",
  "SULAWESI SELATAN",             "Sulawesi",
  "SULAWESI TENGGARA",            "Sulawesi",
  "SULAWESI BARAT",               "Sulawesi",
  "GORONTALO",                    "Sulawesi",
  "MALUKU",                       "Maluku",
  "MALUKU UTARA",                 "Maluku",
  "PAPUA",                        "Papua",
  "PAPUA BARAT",                  "Papua",
  "PAPUA SELATAN",                "Papua",
  "PAPUA TENGAH",                 "Papua",
  "PAPUA PEGUNUNGAN",             "Papua",
  "PAPUA BARAT DAYA",             "Papua"
)

hosp_workforce <- hosp_workforce %>%
  left_join(island_lookup, by = "province_key") %>%
  relocate(island, .after = regency_key)

cat("Unmapped provinces (should be empty):\n")
Unmapped provinces (should be empty):
Code
hosp_workforce %>% filter(is.na(island)) %>% count(province_key) %>% print()
# A tibble: 0 × 2
# ℹ 2 variables: province_key <chr>, n <int>
Code
# Retro-attach island to the saved crosswalk (island_lookup only defined at this point)
kab_crosswalk <- kab_crosswalk %>%
  left_join(island_lookup, by = c("prov_name_canonical" = "province_key")) %>%
  relocate(island, .after = prov_name_canonical)
saveRDS(kab_crosswalk, file.path(OUTPUT_DIR, "kab_crosswalk.rds"))
cat("Re-saved kab_crosswalk.rds with island column\n")
Re-saved kab_crosswalk.rds with island column

6 Build district-level summary

6.1 Eligibility flag

Code
# Attach sirs_empty flag from hosp_sirs so it carries through the joined workforce dataset
hosp_workforce <- hosp_workforce %>%
  left_join(hosp_sirs %>% select(hospital_id, sirs_empty, sirs_error),
            by = c("matched_hospital_id" = "hospital_id")) %>%
  mutate(
    sirs_empty = coalesce(sirs_empty, FALSE),
    hosp_eligible = case_when(
      sirs_empty                                                  ~ 0L,   # no SIRS data submitted
      !is.na(class) & bed_total > 0                               ~ 1L,
      match_type == "sirs_only" & !is.na(class) & bed_total > 0   ~ 1L,
      TRUE                                                        ~ 0L
    )
  )

cat("Hospitals flagged sirs_empty (excluded from SAO):",
    sum(hosp_workforce$sirs_empty, na.rm = TRUE), "\n")
Hospitals flagged sirs_empty (excluded from SAO): 13 

6.2 Aggregate

Code
kab_workforce <- hosp_workforce %>%
  filter(!is.na(regency_key)) %>%
  group_by(province_final = province_key, regency_final = regency_key, island) %>%
  summarise(
    n_hosp          = n(),
    n_hosp_eligible = sum(hosp_eligible, na.rm = TRUE),

    # Raw specialty counts (per-hospital sums)
    anes_hc_raw       = sum(anes_hc_n,       na.rm = TRUE),
    anes_sip_raw      = sum(anes_sip_n,      na.rm = TRUE),
    surg_hc_raw       = sum(surg_hc_n,       na.rm = TRUE),
    surg_sip_raw      = sum(surg_sip_n,      na.rm = TRUE),
    obgyn_hc_raw      = sum(obgyn_hc_n,      na.rm = TRUE),
    obgyn_sip_raw     = sum(obgyn_sip_n,     na.rm = TRUE),
    ortho_hc_raw      = sum(ortho_hc_n,      na.rm = TRUE),
    ortho_sip_raw     = sum(ortho_sip_n,     na.rm = TRUE),

    # Combined S = surg + ortho
    surg_comb_hc_raw  = sum(surg_comb_hc_n,  na.rm = TRUE),
    surg_comb_sip_raw = sum(surg_comb_sip_n, na.rm = TRUE),

    # Specialist-pair team minimums (retained for density aggregates)
    team_gensurg_hc_raw  = sum(team_gensurg_hc_n,  na.rm = TRUE),
    team_gensurg_sip_raw = sum(team_gensurg_sip_n, na.rm = TRUE),
    team_obg_hc_raw      = sum(team_obg_hc_n,      na.rm = TRUE),
    team_obg_sip_raw     = sum(team_obg_sip_n,     na.rm = TRUE),
    team_ortho_hc_raw    = sum(team_ortho_hc_n,    na.rm = TRUE),
    team_ortho_sip_raw   = sum(team_ortho_sip_n,   na.rm = TRUE),

    # SAO hospital-capability counts (n hospitals passing each criterion in the kab)
    n_sao_by_hc   = sum(sao_by_hc,  na.rm = TRUE),
    n_sao_by_sip  = sum(sao_by_sip, na.rm = TRUE),
    n_so_by_hc    = sum(so_by_hc,   na.rm = TRUE),
    n_sao_min2    = sum(sao_min2,   na.rm = TRUE),
    n_so_min2     = sum(so_min2,    na.rm = TRUE),

    # Composite tier counts — SIP
    n_tier_sip_sao2  = sum(sao_tier_sip == "SAO ≥2",  na.rm = TRUE),
    n_tier_sip_sao1  = sum(sao_tier_sip == "SAO ≥1",  na.rm = TRUE),
    n_tier_sip_so    = sum(sao_tier_sip == "SO only",  na.rm = TRUE),
    n_tier_sip_sonly = sum(sao_tier_sip == "S only",   na.rm = TRUE),
    n_tier_sip_oonly = sum(sao_tier_sip == "O only",   na.rm = TRUE),
    n_tier_sip_none  = sum(sao_tier_sip == "None",     na.rm = TRUE),

    # Composite tier counts — HC
    n_tier_hc_sao2   = sum(sao_tier_hc == "SAO ≥2",   na.rm = TRUE),
    n_tier_hc_sao1   = sum(sao_tier_hc == "SAO ≥1",   na.rm = TRUE),
    n_tier_hc_so     = sum(sao_tier_hc == "SO only",   na.rm = TRUE),
    n_tier_hc_sonly  = sum(sao_tier_hc == "S only",    na.rm = TRUE),
    n_tier_hc_oonly  = sum(sao_tier_hc == "O only",    na.rm = TRUE),
    n_tier_hc_none   = sum(sao_tier_hc == "None",      na.rm = TRUE),

    # Edge case diagnostic
    n_specialist_no_bed = sum(has_specialist_no_bed, na.rm = TRUE),

    .groups = "drop"
  ) %>%
  left_join(denom_clean, by = c("province_final" = "province_key",
                                 "regency_final"  = "regency_key"))

6.3 Densities

Code
div_safe <- function(num, den) if_else(den > 0, num / den, NA_real_)

kab_workforce <- kab_workforce %>%
  mutate(
    across(ends_with("_raw"), ~ div_safe(.x, n_hosp),          .names = "{sub('_raw$', '_dens_hosp', .col)}"),
    across(ends_with("_raw"), ~ div_safe(.x, n_hosp_eligible), .names = "{sub('_raw$', '_dens_hosp_adj', .col)}"),
    across(ends_with("_raw"), ~ div_safe(.x, pop) * 1e5,       .names = "{sub('_raw$', '_per_100k', .col)}")
  )

cat("kab_workforce rows:", nrow(kab_workforce), " | cols:", ncol(kab_workforce), "\n")
kab_workforce rows: 513  | cols: 88 
Code
cat("Districts without BPJS pop:", sum(is.na(kab_workforce$pop)), "\n")
Districts without BPJS pop: 2 

7 Sanity checks

Code
cat("Match type:\n"); hosp_workforce %>% count(match_type) %>% print()
Match type:
# A tibble: 6 × 2
  match_type              n
  <chr>               <int>
1 dreams_unmatched       50
2 exact                3089
3 fuzzy                   5
4 manual_override         3
5 placeholder_sirs_id     2
6 sirs_only             205
Code
cat("\nNational totals — HC vs SIP:\n")

National totals — HC vs SIP:
Code
hosp_workforce %>%
  summarise(across(c(anes_hc_n, anes_sip_n, surg_hc_n, surg_sip_n,
                     obgyn_hc_n, obgyn_sip_n, ortho_hc_n, ortho_sip_n,
                     surg_comb_hc_n, surg_comb_sip_n),
                   ~ sum(., na.rm = TRUE))) %>%
  pivot_longer(everything()) %>% print()
# A tibble: 10 × 2
   name            value
   <chr>           <dbl>
 1 anes_hc_n        3419
 2 anes_sip_n       8005
 3 surg_hc_n        3622
 4 surg_sip_n       8193
 5 obgyn_hc_n       5326
 6 obgyn_sip_n     11232
 7 ortho_hc_n       1522
 8 ortho_sip_n      3827
 9 surg_comb_hc_n   5144
10 surg_comb_sip_n 12020
Code
cat("\nSAO capability distribution (hospital-level):\n")

SAO capability distribution (hospital-level):
Code
hosp_workforce %>%
  summarise(across(c(sao_by_hc, sao_by_sip, so_by_hc, sao_min2, so_min2),
                   ~ sum(.x, na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "category", values_to = "n_hospitals") %>%
  mutate(pct = round(100 * n_hospitals / nrow(hosp_workforce), 1)) %>% print()
# A tibble: 5 × 3
  category   n_hospitals   pct
  <chr>            <int> <dbl>
1 sao_by_hc         1183  35.3
2 sao_by_sip        2700  80.5
3 so_by_hc          1504  44.8
4 sao_min2          1602  47.8
5 so_min2           1894  56.5
Code
cat("\nHospitals with specialist but no bed:\n")

Hospitals with specialist but no bed:
Code
cat("  n =", sum(hosp_workforce$has_specialist_no_bed, na.rm = TRUE), "\n")
  n = 62 
Code
cat("\nIsland coverage:\n"); kab_workforce %>% count(island) %>% print()

Island coverage:
# A tibble: 7 × 2
  island                   n
  <chr>                <int>
1 Bali & Nusa Tenggara    41
2 Jawa                   119
3 Kalimantan              56
4 Maluku                  21
5 Papua                   41
6 Sulawesi                81
7 Sumatera               154

8 Export

Code
saveRDS(hosp_workforce,     file.path(OUTPUT_DIR, "hosp_workforce.rds"))
saveRDS(kab_workforce,      file.path(OUTPUT_DIR, "kab_workforce.rds"))
saveRDS(specialty_mapping,  file.path(OUTPUT_DIR, "specialty_mapping.rds"))
saveRDS(island_lookup,      file.path(OUTPUT_DIR, "island_lookup.rds"))

cat("Exported:\n")
Exported:
Code
cat("  hosp_workforce.rds    —", nrow(hosp_workforce),    "rows,", ncol(hosp_workforce),    "cols\n")
  hosp_workforce.rds    — 3354 rows, 73 cols
Code
cat("  kab_workforce.rds     —", nrow(kab_workforce),     "rows,", ncol(kab_workforce),     "cols\n")
  kab_workforce.rds     — 513 rows, 88 cols
Code
cat("  specialty_mapping.rds —", nrow(specialty_mapping), "rows\n")
  specialty_mapping.rds — 70 rows
Code
cat("  island_lookup.rds     —", nrow(island_lookup),     "rows\n")
  island_lookup.rds     — 38 rows