library(data.table)
#>
#> Attaching package: 'data.table'
#> The following object is masked from 'package:base':
#>
#> %notin%
Introduction
This vignette demonstrates how to build a skeleton by hand – creating the time-structured framework, integrating raw registry data, and deriving analysis-ready variables.
Prerequisites: If you’re new to swereg, start with the “Skeleton concept” vignette to learn the conceptual foundation.
For production pipelines that process hundreds of thousands of
individuals with incremental rebuild support, see
vignette("skeleton-pipeline").
Step 1: create the time grid
Start by creating a skeleton with your study population and time period:
# Bring the bundled example person identifiers into the session
data("fake_person_ids", package = "swereg")

# Build the person-by-time grid for the 2015-2020 study window.
# NOTE(review): the preview below starts at isoyear 1900 with annual
# ("YYYY-**") rows, so the grid is evidently not limited to
# date_min..date_max -- confirm against the create_skeleton() documentation.
skeleton <- swereg::create_skeleton(
  ids = fake_person_ids,
  date_min = "2015-01-01",
  date_max = "2020-12-31"
)

# Preview the first rows of the grid
head(skeleton)
#> id isoyear isoyearweek is_isoyear isoyearweeksun personyears
#> <int> <int> <char> <lgcl> <Date> <num>
#> 1: 1 1900 1900-** TRUE 1900-07-01 1
#> 2: 1 1901 1901-** TRUE 1901-06-30 1
#> 3: 1 1902 1902-** TRUE 1902-06-29 1
#> 4: 1 1903 1903-** TRUE 1903-06-28 1
#> 5: 1 1904 1904-** TRUE 1904-07-03 1
#> 6: 1 1905 1905-** TRUE 1905-07-02 1
# Report how many rows and columns the grid has
cat(
  "Skeleton dimensions:",
  nrow(skeleton), "rows,",
  ncol(skeleton), "columns\n"
)
#> Skeleton dimensions: 430000 rows, 6 columns
The skeleton contains:
- id: Individual identifier
- isoyear: ISO year (for annual data)
- isoyearweek: ISO year-week (for weekly data, format “YYYY-WW” or “YYYY-**” for annual rows)
- is_isoyear: Boolean indicating annual vs weekly rows
- isoyearweeksun: Date representing the Sunday (last day) of the ISO week/year
Step 2: add demographic data (one-time)
Demographics don’t change over time, so we add them once per person:
# Load and prepare demographic data
fake_demographics <- swereg::fake_demographics |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "fodelseman")

# Snapshot the column names before the merge. copy() is required because the
# vector returned by names() on a data.table can be updated by reference when
# columns are added later.
cols_before <- data.table::copy(names(skeleton))

# Add to skeleton (the result is not assigned; the new columns show up in
# `skeleton` itself afterwards)
swereg::add_onetime(skeleton, fake_demographics, id_name = "lopnr")

# Check what was added: every column that was not present beforehand.
# Comparing against a hard-coded list of skeleton columns mislabelled the
# pre-existing `personyears` column as a demographic variable.
new_vars <- setdiff(names(skeleton), cols_before)
cat("Added demographic variables:", paste(new_vars, collapse = ", "), "\n")
#> Added demographic variables: fodelseman, doddatum
Step 3: add annual data
Some data varies by year (e.g., family status, income):
# Prepare the yearly family extract: work on a copy and lowercase the names
fake_annual_family <- swereg::fake_annual_family |>
  data.table::copy() |>
  swereg::make_lowercase_names()

# Add the annual data for 2015 to the skeleton
swereg::add_annual(
  skeleton,
  fake_annual_family,
  id_name = "lopnr",
  isoyear = 2015
)
cat("Annual data added for 2015\n")
#> Annual data added for 2015
Step 4: add diagnosis data
Hospital diagnoses drive most epidemiological studies:
# Load and prepare diagnosis data
fake_diagnoses <- swereg::fake_diagnoses |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "indatum")
#> Found additional date columns not in date_columns: utdatum. Consider adding them for automatic date parsing.

# Define diagnosis patterns to search for (^ prefix automatically added)
diagnosis_patterns <- list(
  "depression" = c("F32", "F33"),
  "anxiety" = c("F40", "F41"),
  "gender_dysphoria" = c("F64"),
  "psychosis" = c("F20", "F25"),
  "cardiovascular" = c("I10", "I20", "I21")
)

# Add diagnoses to skeleton. The patterns are passed as `codes`; the older
# `diags` argument is deprecated and emits a warning.
swereg::add_diagnoses(
  skeleton,
  fake_diagnoses,
  id_name = "lopnr",
  codes = diagnosis_patterns
)

# Check results: each pattern name is a column of the skeleton, summed here
# while ignoring missing values
diag_vars <- names(diagnosis_patterns)
for (var in diag_vars) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "positive cases\n")
}
#> - depression : 322 positive cases
#> - anxiety : 314 positive cases
#> - gender_dysphoria : 461 positive cases
#> - psychosis : 345 positive cases
#> - cardiovascular : 208 positive cases
Step 5: add prescription data
Medication data with treatment duration and ATC code patterns:
# Load prescription data
fake_prescriptions <- swereg::fake_prescriptions |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "edatum")

# Define drug patterns (ATC codes, ^ prefix automatically added)
drug_patterns <- list(
  "antidepressants" = c("N06A"),
  "antipsychotics" = c("N05A"),
  "hormones" = c("G03"),
  "cardiovascular_drugs" = c("C07", "C08", "C09")
)

# Add prescriptions to skeleton. The patterns are passed as `codes`; the older
# `rxs` argument is deprecated and emits a warning.
swereg::add_rx(
  skeleton,
  fake_prescriptions,
  id_name = "p444_lopnr_personnr",
  codes = drug_patterns
)

# Check prescription usage: each pattern name is a column of the skeleton,
# summed here while ignoring missing values
rx_vars <- names(drug_patterns)
for (var in rx_vars) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "prescription periods\n")
}
#> - antidepressants : 4066 prescription periods
#> - antipsychotics : 2938 prescription periods
#> - hormones : 15124 prescription periods
#> - cardiovascular_drugs : 2165 prescription periods
Step 6: add surgical operation data
Surgical procedures from hospital records:
# Add operations (using default gender-affirming surgery codes)
swereg::add_operations(skeleton, fake_diagnoses, "lopnr")

# Check operation counts: collect every skeleton column whose name starts
# with "op_"
operation_vars <- grep("^op_", names(skeleton), value = TRUE)
cat("Operation variables added:", length(operation_vars), "\n")
#> Operation variables added: 9
# Show at most the first 3. head() is used instead of operation_vars[1:3]
# because the index form yields NA entries when fewer than 3 columns match.
for (var in head(operation_vars, 3)) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "procedures\n")
}
#> - op_afab_mastectomy : 250 procedures
#> - op_afab_breast_reconst_and_other_breast_ops : 0 procedures
#> - op_afab_penis_test_prosth : 0 procedures
Step 7: add cause of death data
For mortality studies:
# Load cause of death data
fake_cod <- swereg::fake_cod |>
  data.table::copy() |>
  swereg::make_lowercase_names(date_columns = "dodsdat")

# Define cause of death patterns (^ prefix automatically added)
cod_patterns <- list(
  "cardiovascular_death" = c("I21", "I22"),
  "external_causes" = c("X60", "X70")
)

# Add to skeleton. The patterns are passed as `codes`; the older `cods`
# argument is deprecated and emits a warning.
swereg::add_cods(
  skeleton,
  fake_cod,
  id_name = "lopnr",
  codes = cod_patterns
)

# Check mortality: each pattern name is a column of the skeleton, summed here
# while ignoring missing values
cod_vars <- names(cod_patterns)
for (var in cod_vars) {
  count <- sum(skeleton[[var]], na.rm = TRUE)
  cat("-", var, ":", count, "deaths\n")
}
#> - cardiovascular_death : 16 deaths
#> - external_causes : 9 deaths
Step 8: derive variables
With all registry data integrated, create analysis-ready variables using only data already in the skeleton.
Age
# Birth year is the first four characters of fodelseman; age is the row's
# ISO year minus that birth year.
skeleton[, birth_year := as.numeric(substr(fodelseman, 1, 4))]
skeleton[, age := isoyear - birth_year]
Composite clinical indicators
# Each composite is the row-wise OR of the listed diagnosis columns
skeleton[, any_mental_health := depression | anxiety | psychosis]
skeleton[, severe_mental_illness := psychosis | gender_dysphoria]
Treatment concordance
# Row-wise AND of a diagnosis column and the corresponding drug column
skeleton[, depression_treated := depression & antidepressants]
skeleton[, psychosis_treated := psychosis & antipsychotics]
Life stage
# fcase() takes the value of the first condition that is TRUE, so the "adult"
# branch is only reached by rows that did not match `age < 18`. Rows where no
# condition is TRUE (e.g. missing age) receive the default.
skeleton[
  ,
  life_stage := fcase(
    age < 18, "child",
    age < 65, "adult",
    age >= 65, "elderly",
    default = "unknown"
  )
]
cat("Life stage distribution:\n")
#> Life stage distribution:
# Tabulate the life stage of the annual rows, showing NA only if present
print(table(skeleton[is_isoyear == TRUE, life_stage], useNA = "ifany"))
#>
#> adult child elderly
#> 20628 95355 17
Row-independent first-occurrence variables
# Year of first depression diagnosis: value of `isoyear` where the condition
# first holds, written to the new column named in `new_var`
swereg::make_rowind_first_occurrence(
  skeleton,
  condition = "depression == TRUE",
  value_var = "isoyear",
  new_var = "ri_isoyear_first_depression"
)

# Age at first depression diagnosis: same condition, taking `age` instead
swereg::make_rowind_first_occurrence(
  skeleton,
  condition = "depression == TRUE",
  value_var = "age",
  new_var = "ri_age_first_depression"
)
The finished skeleton
The skeleton now contains all registry data integrated and all derived variables ready for analysis:
cat("Variables:", paste(names(skeleton), collapse = ", "), "\n")
#> Variables: id, isoyear, isoyearweek, is_isoyear, isoyearweeksun, personyears, doddatum, famtyp, depression, anxiety, gender_dysphoria, psychosis, cardiovascular, antidepressants, antipsychotics, hormones, cardiovascular_drugs, op_afab_mastectomy, op_afab_breast_reconst_and_other_breast_ops, op_afab_penis_test_prosth, op_afab_internal_genital, op_afab_colpectomy, op_amab_breast_reconst_and_other_breast_ops, op_amab_reconst_vag, op_amab_penis_amp, op_amab_larynx, cardiovascular_death, external_causes, age, any_mental_health, severe_mental_illness, depression_treated, psychosis_treated, life_stage, death_any, ri_isoyear_first_depression, ri_age_first_depression
# Example: depression prevalence by life stage
# NOTE(review): the printed result shows a prevalence of 0 in every life
# stage. The filter keeps annual rows only (is_isoyear == TRUE); presumably
# diagnoses are recorded on weekly rows -- TODO confirm the intended filter.
depression_summary <- skeleton[
  is_isoyear == TRUE & isoyear >= 2015,
  .(
    n_person_years = .N,
    depression_prev = mean(depression, na.rm = TRUE),
    # The test is a single value per group, so plain if/else is used instead
    # of the vectorised ifelse(). The rate is NA when the group has no
    # depression rows to average over.
    treatment_rate = if (sum(depression, na.rm = TRUE) > 0) {
      mean(depression_treated[depression == TRUE], na.rm = TRUE)
    } else {
      NA_real_
    }
  ),
  by = .(life_stage)
]
print(depression_summary[n_person_years > 0])
#> life_stage n_person_years depression_prev treatment_rate
#> <char> <int> <num> <num>
#> 1: adult 832 0 NA
#> 2: child 151 0 NA
#> 3: elderly 17 0 NA
Key principles
- Always use make_lowercase_names() after reading registry data
- Sequential integration: Add data types in logical order
- Pattern matching: Use regex patterns for medical codes (^ prefix automatically added)
- Derive from within: Variable derivation uses only data already in the skeleton
- rd_ / ri_ convention: Time-varying variables get the rd_ prefix, time-invariant variables get ri_ – see vignette("rowdep-rowind-concept")
Next steps
- Analyse the skeleton: See vignette("skeleton-analyze") for aggregation and analysis patterns
- Production pipelines: See vignette("skeleton-pipeline") for the R6-based RegistryStudy workflow with incremental rebuilds and batched processing
