Creating the skeleton • swereg

library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following object is masked from 'package:base':
#> 
#>     %notin%

Introduction

This vignette demonstrates how to build a skeleton by hand – creating the time-structured framework, integrating raw registry data, and deriving analysis-ready variables.

Prerequisites: If you’re new to swereg, start with the “Skeleton concept” vignette to learn the conceptual foundation.

For production pipelines that process hundreds of thousands of individuals with incremental rebuild support, see vignette("skeleton-pipeline").

Step 1: create the time grid

Start by creating a skeleton with your study population and time period:

# Load example data: the person identifiers that make up the study population
data("fake_person_ids", package = "swereg")

# Create the skeleton for a 2015-2020 study period.
# Note: the head() output below starts at isoyear 1900 with annual rows
# (is_isoyear == TRUE, isoyearweek "YYYY-**"), so the skeleton also holds
# yearly rows from before date_min, not only rows inside 2015-2020.
skeleton <- swereg::create_skeleton(
  ids = fake_person_ids,
  date_min = "2015-01-01",
  date_max = "2020-12-31"
)

# Examine the structure: first rows, then the overall dimensions
head(skeleton)
#>       id isoyear isoyearweek is_isoyear isoyearweeksun personyears
#>    <int>   <int>      <char>     <lgcl>         <Date>       <num>
#> 1:     1    1900     1900-**       TRUE     1900-07-01           1
#> 2:     1    1901     1901-**       TRUE     1901-06-30           1
#> 3:     1    1902     1902-**       TRUE     1902-06-29           1
#> 4:     1    1903     1903-**       TRUE     1903-06-28           1
#> 5:     1    1904     1904-**       TRUE     1904-07-03           1
#> 6:     1    1905     1905-**       TRUE     1905-07-02           1
cat("Skeleton dimensions:", nrow(skeleton), "rows,", ncol(skeleton), "columns\n")
#> Skeleton dimensions: 430000 rows, 6 columns

The skeleton contains:

id: Individual identifier
isoyear: ISO year (for annual data)
isoyearweek: ISO year-week (for weekly data, format “YYYY-WW” or “YYYY-**” for annual rows)
is_isoyear: Boolean indicating annual vs weekly rows
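isoyearweeksun: Date of the Sunday that ends the ISO week; for annual rows, a Sunday around mid-year (e.g. 1900-07-01 in the output above)
personyears: Person-time represented by the row, in years (1 for the annual rows shown above)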

Step 2: add demographic data (one-time)

Demographics don’t change over time, so we add them once per person:

# Load and prepare demographic data (work on a copy of the packaged dataset)
fake_demographics <- swereg::fake_demographics |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "fodelseman")

# Snapshot the existing column names so that only columns actually added by
# add_onetime() are reported. copy() is needed because names() of a data.table
# can be updated by reference when columns are added.
cols_before <- data.table::copy(names(skeleton))

# Add to skeleton (modifies skeleton in place; the result is not reassigned)
swereg::add_onetime(skeleton, fake_demographics, id_name = "lopnr")

# Check what was added: every column that was not there before the call
new_vars <- setdiff(names(skeleton), cols_before)
cat("Added demographic variables:", paste(new_vars, collapse = ", "), "\n")
#> Added demographic variables: fodelseman, doddatum

Step 3: add annual data

Some data varies by year (e.g., family status, income):

# Prepare the annual family data: copy the packaged dataset, then lowercase
# its column names
annual_family_raw <- data.table::copy(swereg::fake_annual_family)
fake_annual_family <- swereg::make_lowercase_names(annual_family_raw)

# Add the annual data for ISO year 2015
swereg::add_annual(
  skeleton,
  fake_annual_family,
  id_name = "lopnr",
  isoyear = 2015
)

cat("Annual data added for 2015\n")
#> Annual data added for 2015

Step 4: add diagnosis data

Hospital diagnoses drive most epidemiological studies:

# Load and prepare diagnosis data. Both date columns are listed so that
# utdatum is parsed together with indatum instead of being left unparsed
# (make_lowercase_names() reports date columns missing from date_columns).
fake_diagnoses <- swereg::fake_diagnoses |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = c("indatum", "utdatum"))

# Define diagnosis patterns to search for (^ prefix automatically added).
# Each list name becomes a column in the skeleton; each value is the set of
# code prefixes that set that column to TRUE.
diagnosis_patterns <- list(
  "depression" = c("F32", "F33"),
  "anxiety" = c("F40", "F41"),
  "gender_dysphoria" = c("F64"),
  "psychosis" = c("F20", "F25"),
  "cardiovascular" = c("I10", "I20", "I21")
)

# Add diagnoses to skeleton. The patterns are passed as `codes`; the older
# `diags` argument is deprecated and raises a warning.
swereg::add_diagnoses(
  skeleton,
  fake_diagnoses,
  id_name = "lopnr",
  codes = diagnosis_patterns
)

# Check results: number of skeleton rows flagged TRUE for each diagnosis group
diag_vars <- names(diagnosis_patterns)
for (var in diag_vars) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "positive cases\n")
}
#> - depression : 322 positive cases
#> - anxiety : 314 positive cases
#> - gender_dysphoria : 461 positive cases
#> - psychosis : 345 positive cases
#> - cardiovascular : 208 positive cases

Step 5: add prescription data

Medication data with treatment duration and ATC code patterns:

# Load prescription data (work on a copy of the packaged dataset)
fake_prescriptions <- swereg::fake_prescriptions |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "edatum")

# Define drug patterns (ATC codes, ^ prefix automatically added).
# Each list name becomes a column in the skeleton.
drug_patterns <- list(
  "antidepressants" = c("N06A"),
  "antipsychotics" = c("N05A"),
  "hormones" = c("G03"),
  "cardiovascular_drugs" = c("C07", "C08", "C09")
)

# Add prescriptions to skeleton. The patterns are passed as `codes`; the older
# `rxs` argument is deprecated and raises a warning.
swereg::add_rx(
  skeleton,
  fake_prescriptions,
  id_name = "p444_lopnr_personnr",
  codes = drug_patterns
)

# Check prescription usage: number of skeleton rows flagged TRUE per drug group
rx_vars <- names(drug_patterns)
for (var in rx_vars) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "prescription periods\n")
}
#> - antidepressants : 4066 prescription periods
#> - antipsychotics : 2938 prescription periods
#> - hormones : 15124 prescription periods
#> - cardiovascular_drugs : 2165 prescription periods

Step 6: add surgical operation data

Surgical procedures from hospital records:

# Add operations (using default gender-affirming surgery codes)
swereg::add_operations(skeleton, fake_diagnoses, "lopnr")

# Check operation counts: the added columns all carry the "op_" prefix
operation_vars <- grep("^op_", names(skeleton), value = TRUE)
cat("Operation variables added:", length(operation_vars), "\n")
#> Operation variables added: 9
# head() shows at most the first 3 and, unlike [1:3], never produces NA names
# when fewer than three operation columns exist
for (var in head(operation_vars, 3)) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "procedures\n")
}
#> - op_afab_mastectomy : 250 procedures
#> - op_afab_breast_reconst_and_other_breast_ops : 0 procedures
#> - op_afab_penis_test_prosth : 0 procedures

Step 7: add cause of death data

For mortality studies:

# Load cause of death data (work on a copy of the packaged dataset)
fake_cod <- swereg::fake_cod |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "dodsdat")

# Define cause of death patterns (^ prefix automatically added).
# Each list name becomes a column in the skeleton.
cod_patterns <- list(
  "cardiovascular_death" = c("I21", "I22"),
  "external_causes" = c("X60", "X70")
)

# Add to skeleton. The patterns are passed as `codes`; the older `cods`
# argument is deprecated and raises a warning.
swereg::add_cods(
  skeleton,
  fake_cod,
  id_name = "lopnr",
  codes = cod_patterns
)

# Check mortality: number of skeleton rows flagged TRUE per cause group
cod_vars <- names(cod_patterns)
for (var in cod_vars) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "deaths\n")
}
#> - cardiovascular_death : 16 deaths
#> - external_causes : 9 deaths

Step 8: derive variables

With all registry data integrated, create analysis-ready variables using only data already in the skeleton.

Age

# Birth year is taken from the first four characters of fodelseman
# NOTE(review): assumes fodelseman begins with a four-digit year -- confirm
skeleton[, birth_year := as.numeric(substr(fodelseman, 1, 4))]
# Age is a calendar-year difference (isoyear minus birth year): it ignores the
# month of birth and is negative for rows that precede the birth year
skeleton[, age := isoyear - birth_year]

Composite clinical indicators

# Row-wise OR of existing diagnosis flags, added in a single by-reference call
skeleton[, `:=`(
  any_mental_health = depression | anxiety | psychosis,
  severe_mental_illness = psychosis | gender_dysphoria
)]

Treatment concordance

# A row counts as "treated" when the diagnosis flag and the matching drug flag
# are both TRUE on that same row
skeleton[, `:=`(
  depression_treated = depression & antidepressants,
  psychosis_treated = psychosis & antipsychotics
)]

Life stage

# Classify each row by age. fcase() uses the first condition that is TRUE, so
# the "adult" branch only sees rows that are not already "child"; rows that
# match no condition (e.g. missing age) get the default "unknown".
skeleton[, life_stage := fcase(
  age < 18, "child",
  age < 65, "adult",
  age >= 65, "elderly",
  default = "unknown"
)]

cat("Life stage distribution:\n")
#> Life stage distribution:
# Tabulate over the annual rows only
print(table(skeleton[is_isoyear == TRUE, life_stage], useNA = "ifany"))
#> 
#>   adult   child elderly 
#>   20628   95355      17

Outcome variables

# Flag rows where either of the two cause-of-death groups added in step 7 is
# TRUE; deaths from causes outside those two groups are not captured here
skeleton[, death_any := external_causes | cardiovascular_death]
# Count the flagged rows, ignoring missing values
cat("Any death:", sum(skeleton$death_any, na.rm = TRUE), "deaths\n")
#> Any death: 22 deaths

Row-independent first-occurrence variables

# Year of, and age at, the first depression diagnosis. Both variables share
# the same condition and differ only in which skeleton column is recorded:
# names are the source columns, values are the new variable names.
first_depression_vars <- c(
  isoyear = "ri_isoyear_first_depression",
  age = "ri_age_first_depression"
)
for (source_col in names(first_depression_vars)) {
  swereg::make_rowind_first_occurrence(
    skeleton,
    condition = "depression == TRUE",
    value_var = source_col,
    new_var = first_depression_vars[[source_col]]
  )
}

Step 9: quality filters

Apply study criteria:

cat("Before filtering:", nrow(skeleton), "rows\n")
#> Before filtering: 430000 rows
# Keep plausible ages (0-100) within the study period (2015 onwards) in one
# pass; rows where the condition is NA are dropped, as with separate filters
skeleton <- skeleton[age >= 0 & age <= 100 & isoyear >= 2015]
cat("After filtering:", nrow(skeleton), "rows\n")
#> After filtering: 315000 rows

Step 10: clean up temporary columns

# Helper columns that were only needed to derive age
temp_cols <- c("fodelseman", "birth_year")
skeleton[, (temp_cols) := NULL]

cat("Final skeleton:", nrow(skeleton), "rows,", ncol(skeleton), "columns\n")
#> Final skeleton: 315000 rows, 37 columns

The finished skeleton

The skeleton now contains all registry data integrated and all derived variables ready for analysis:

# List every column in the finished skeleton
cat("Variables:", paste(names(skeleton), collapse = ", "), "\n")
#> Variables: id, isoyear, isoyearweek, is_isoyear, isoyearweeksun, personyears, doddatum, famtyp, depression, anxiety, gender_dysphoria, psychosis, cardiovascular, antidepressants, antipsychotics, hormones, cardiovascular_drugs, op_afab_mastectomy, op_afab_breast_reconst_and_other_breast_ops, op_afab_penis_test_prosth, op_afab_internal_genital, op_afab_colpectomy, op_amab_breast_reconst_and_other_breast_ops, op_amab_reconst_vag, op_amab_penis_amp, op_amab_larynx, cardiovascular_death, external_causes, age, any_mental_health, severe_mental_illness, depression_treated, psychosis_treated, life_stage, death_any, ri_isoyear_first_depression, ri_age_first_depression

# Example: depression prevalence by life stage, computed on annual rows.
# n_person_years is the number of rows in each group.
# NOTE(review): every group below shows a prevalence of 0, which suggests the
# diagnosis flags are not set on annual rows -- consider whether the weekly
# rows (is_isoyear == FALSE) are the intended basis for this summary.
depression_summary <- skeleton[is_isoyear == TRUE & isoyear >= 2015, .(
  n_person_years = .N,
  depression_prev = mean(depression, na.rm = TRUE),
  # The condition is a single value per group, so plain if/else is used rather
  # than the vectorised ifelse(); groups without any depression rows get NA
  treatment_rate = if (sum(depression, na.rm = TRUE) > 0) {
    mean(depression_treated[depression == TRUE], na.rm = TRUE)
  } else {
    NA_real_
  }
), by = .(life_stage)]

print(depression_summary[n_person_years > 0])
#>    life_stage n_person_years depression_prev treatment_rate
#>        <char>          <int>           <num>          <num>
#> 1:      adult            832               0             NA
#> 2:      child            151               0             NA
#> 3:    elderly             17               0             NA

Key principles

Always use make_lowercase_names() after reading registry data
Sequential integration: Add data types in logical order
Pattern matching: Use regex patterns for medical codes (^ prefix automatically added)
Derive from within: Variable derivation uses only data already in the skeleton
rd_ / ri_ convention: Time-varying (row-dependent) variables get the rd_ prefix, time-invariant (row-independent) variables get the ri_ prefix – see vignette("rowdep-rowind-concept")

Next steps

Analyse the skeleton: See vignette("skeleton-analyze") for aggregation and analysis patterns
Production pipelines: See vignette("skeleton-pipeline") for the R6-based RegistryStudy workflow with incremental rebuilds and batched processing