library(data.table)
#>
#> Attaching package: 'data.table'
#> The following object is masked from 'package:base':
#>
#> %notin%Introduction
This vignette demonstrates how to work with a finished skeleton – collapsing weekly data to yearly summaries, creating analysis datasets, and running common epidemiological queries.
Prerequisites: Complete
vignette("skeleton-create") first, or use the
RegistryStudy pipeline from
vignette("skeleton-pipeline").
Build a skeleton for this vignette
We create a small skeleton with the same steps as
vignette("skeleton-create"), condensed here for
convenience:
skeleton <- swereg::create_skeleton(swereg::fake_person_ids, "2015-01-01", "2020-12-31")
# Demographics
fake_demographics <- swereg::fake_demographics |>
data.table::copy() |>
swereg::make_lowercase_names(date_columns = "fodelseman")
swereg::add_onetime(skeleton, fake_demographics, id_name = "lopnr")
# Diagnoses
fake_diagnoses <- swereg::fake_diagnoses |>
data.table::copy() |>
swereg::make_lowercase_names(date_columns = "indatum")
#> Found additional date columns not in date_columns: utdatum. Consider adding them for automatic date parsing.
swereg::add_diagnoses(skeleton, fake_diagnoses, id_name = "lopnr",
diags = list(
"depression" = c("F32", "F33"),
"anxiety" = c("F40", "F41"),
"gender_dysphoria" = c("F64"),
"psychosis" = c("F20", "F25")
))
#> Warning: 'diags' is deprecated, use 'codes' instead.
# Prescriptions
fake_prescriptions <- swereg::fake_prescriptions |>
data.table::copy() |>
swereg::make_lowercase_names(date_columns = "edatum")
swereg::add_rx(skeleton, fake_prescriptions, id_name = "p444_lopnr_personnr",
rxs = list(
"antidepressants" = c("N06A"),
"antipsychotics" = c("N05A"),
"hormones" = c("G03")
))
#> Warning: 'rxs' is deprecated, use 'codes' instead.
# Cause of death
fake_cod <- swereg::fake_cod |>
data.table::copy() |>
swereg::make_lowercase_names(date_columns = "dodsdat")
swereg::add_cods(skeleton, fake_cod, id_name = "lopnr",
cods = list(
"external_death" = c("X60", "X70"),
"cardiovascular_death" = c("I21", "I22")
))
#> Warning: 'cods' is deprecated, use 'codes' instead.
# Derived variables
skeleton[, birth_year := as.numeric(substr(fodelseman, 1, 4))]
skeleton[, age := isoyear - birth_year]
skeleton[, any_mental_health := depression | anxiety | psychosis]
skeleton[, depression_treated := depression & antidepressants]
skeleton[, death_any := external_death | cardiovascular_death]
skeleton[, life_stage := fcase(
age < 18, "child",
age >= 18 & age < 65, "adult",
age >= 65, "elderly",
default = "unknown"
)]
skeleton[, c("fodelseman", "birth_year") := NULL]
# Filter
skeleton <- skeleton[age >= 0 & age <= 100 & isoyear >= 2015]
cat("Skeleton ready:", nrow(skeleton), "rows,", ncol(skeleton), "columns\n")
#> Skeleton ready: 315000 rows, 21 columnsWeekly vs yearly rows
The skeleton contains both weekly and yearly rows. The
is_isoyear column distinguishes them:
cat("Weekly rows:", sum(!skeleton$is_isoyear), "\n")
#> Weekly rows: 314000
cat("Yearly rows:", sum(skeleton$is_isoyear), "\n")
#> Yearly rows: 1000Weekly rows carry event-level precision (diagnosis this week, prescription active this week). Yearly rows carry annually-updated data (income, education, family type). Most analyses need the data collapsed to one granularity.
Collapsing weekly data to person-years
The most common aggregation: collapse weekly boolean indicators into yearly summaries answering “did anything happen this year?”
Use swereg::max_with_infinite_as_na() for
boolean/logical columns – it returns TRUE if any week was
TRUE, FALSE if all were FALSE,
and NA if all were NA:
person_years <- skeleton[, .(
age = swereg::first_non_na(age),
life_stage = swereg::first_non_na(life_stage),
# "Did this happen at any point during the year?"
depression = swereg::max_with_infinite_as_na(depression),
anxiety = swereg::max_with_infinite_as_na(anxiety),
psychosis = swereg::max_with_infinite_as_na(psychosis),
any_mental_health = swereg::max_with_infinite_as_na(any_mental_health),
gender_dysphoria = swereg::max_with_infinite_as_na(gender_dysphoria),
antidepressants = swereg::max_with_infinite_as_na(antidepressants),
antipsychotics = swereg::max_with_infinite_as_na(antipsychotics),
hormones = swereg::max_with_infinite_as_na(hormones),
depression_treated = swereg::max_with_infinite_as_na(depression_treated),
death_any = swereg::max_with_infinite_as_na(death_any)
), by = .(id, isoyear)]
cat("Person-year dataset:", nrow(person_years), "rows\n")
#> Person-year dataset: 6000 rows
head(person_years)
#> id isoyear age life_stage depression anxiety psychosis
#> <int> <int> <num> <char> <int> <int> <int>
#> 1: 1 2015 56 adult 0 0 0
#> 2: 1 2016 57 adult 0 0 0
#> 3: 1 2017 58 adult 0 0 0
#> 4: 1 2018 59 adult 0 0 0
#> 5: 1 2019 60 adult 0 0 0
#> 6: 1 2020 61 adult 0 0 0
#> any_mental_health gender_dysphoria antidepressants antipsychotics hormones
#> <int> <int> <int> <int> <int>
#> 1: 0 0 0 0 0
#> 2: 0 0 0 0 0
#> 3: 0 0 0 0 0
#> 4: 0 1 0 0 0
#> 5: 0 0 0 0 0
#> 6: 0 0 0 0 1
#> depression_treated death_any
#> <int> <int>
#> 1: 0 0
#> 2: 0 0
#> 3: 0 0
#> 4: 0 0
#> 5: 0 0
#> 6: 0 0Example analyses on the person-year dataset
Prevalence by year
prevalence <- person_years[, .(
n = .N,
depression_prev = mean(depression, na.rm = TRUE),
anxiety_prev = mean(anxiety, na.rm = TRUE),
treatment_rate = ifelse(
sum(depression, na.rm = TRUE) > 0,
mean(depression_treated[depression == TRUE], na.rm = TRUE),
NA_real_
)
), by = .(isoyear)]
print(prevalence[order(isoyear)])
#> isoyear n depression_prev anxiety_prev treatment_rate
#> <int> <int> <num> <num> <num>
#> 1: 2015 1000 0.009 0.008 0.0000000
#> 2: 2016 1000 0.007 0.005 0.0000000
#> 3: 2017 1000 0.007 0.008 0.0000000
#> 4: 2018 1000 0.012 0.006 0.0000000
#> 5: 2019 1000 0.007 0.006 0.1428571
#> 6: 2020 1000 0.004 0.010 0.0000000Prevalence by life stage
by_stage <- person_years[, .(
n = .N,
depression_prev = mean(depression, na.rm = TRUE),
antidepressant_use = mean(antidepressants, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE)
), by = .(life_stage)]
print(by_stage)
#> life_stage n depression_prev antidepressant_use mean_age
#> <char> <int> <num> <num> <num>
#> 1: adult 5007 0.007389654 0.07629319 40.90174
#> 2: elderly 353 0.011331445 0.09065156 66.67989
#> 3: child 640 0.007812500 0.07968750 14.44219Treatment patterns among those with any mental health condition
treatment <- person_years[any_mental_health == TRUE, .(
antidepressant_use = mean(antidepressants, na.rm = TRUE),
antipsychotic_use = mean(antipsychotics, na.rm = TRUE),
hormone_use = mean(hormones, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
n = .N
), by = .(isoyear)]
print(treatment[order(isoyear)])
#> isoyear antidepressant_use antipsychotic_use hormone_use mean_age n
#> <int> <num> <num> <num> <num> <int>
#> 1: 2015 0.04000000 0.04000000 0.1600000 31.68000 25
#> 2: 2016 0.09523810 0.04761905 0.1904762 39.19048 21
#> 3: 2017 0.16000000 0.04000000 0.1200000 41.76000 25
#> 4: 2018 0.04545455 0.00000000 0.3181818 39.22727 22
#> 5: 2019 0.05000000 0.20000000 0.1500000 42.90000 20
#> 6: 2020 0.00000000 0.05000000 0.2000000 40.35000 20Working directly with weekly data
Some analyses need weekly resolution – for example, time-to-event models or exposure-lag analyses. Use the weekly rows directly:
Creating a person-level baseline table
Collapse to one row per person for baseline characteristics:
baseline <- person_years[, .(
first_year = min(isoyear),
last_year = max(isoyear),
follow_up_years = .N,
ever_depression = any(depression, na.rm = TRUE),
ever_anxiety = any(anxiety, na.rm = TRUE),
ever_gd = any(gender_dysphoria, na.rm = TRUE),
died = any(death_any, na.rm = TRUE)
), by = .(id)]
cat("Baseline table:", nrow(baseline), "individuals\n")
#> Baseline table: 1000 individuals
print(head(baseline))
#> id first_year last_year follow_up_years ever_depression ever_anxiety
#> <int> <int> <int> <int> <lgcl> <lgcl>
#> 1: 1 2015 2020 6 FALSE FALSE
#> 2: 2 2015 2020 6 FALSE FALSE
#> 3: 3 2015 2020 6 FALSE FALSE
#> 4: 4 2015 2020 6 FALSE FALSE
#> 5: 5 2015 2020 6 FALSE FALSE
#> 6: 6 2015 2020 6 FALSE FALSE
#> ever_gd died
#> <lgcl> <lgcl>
#> 1: TRUE FALSE
#> 2: FALSE FALSE
#> 3: FALSE FALSE
#> 4: FALSE FALSE
#> 5: FALSE FALSE
#> 6: FALSE FALSESummary
The two-step manual workflow:
-
Create the skeleton
(
vignette("skeleton-create")): build the time grid, integrate registry data, derive variables - Analyse the skeleton (this vignette): collapse to the right granularity and run queries
For production-scale pipelines with incremental rebuilds, see
vignette("skeleton-pipeline").
