library(data.table)
library(swereg)
# library(qs2) # Optional: for faster file I/O with qd_save/qd_read
Introduction
This vignette demonstrates skeleton3_analyze - memory-efficient batching techniques for processing huge datasets with limited RAM.
Important note: This stage is only required if you have very large datasets (>100,000 individuals) and insufficient RAM. For most analyses, you can use the cleaned skeleton2_clean output directly for statistical modeling.
Prerequisites: Complete the “Cleaning and deriving variables (skeleton2_clean)” vignette first, as this stage builds on skeleton2_clean output.
What is skeleton3_analyze?
The skeleton3_analyze stage focuses on:
- Final data aggregation: Weekly→yearly data collapsing as needed
- Analysis dataset creation: Optimized for specific research questions
- Memory-efficient processing: Batching strategies for large populations
- Production workflows: Scalable approaches for hundreds of thousands of individuals
This stage creates the final analysis datasets ready for statistical modeling.
When to use batching strategies
Use batching strategies when:
- Study population > 100,000 individuals
- Memory constraints: Limited RAM for the full dataset
- Processing time: Long-running operations that benefit from parallel processing
- File size management: Breaking large datasets into manageable chunks
Setup: memory-efficient workflow
# Setup for batched processing
BATCH_SIZE <- 50 # Small for demonstration - use 1000-5000 for real studies
OUTPUT_DIR <- tempdir() # Use temporary directory for vignette

# Setup for demonstration (see skeleton1_create vignette for detailed data integration)
# For batching, we organize data into large_data_files list for memory management.
# Each dataset is copied before lowercasing names so the swereg package datasets
# are never modified in place. (The original list also contained "fake_diagnoses"
# twice; the duplicate entry has been removed.)
large_data_files <- list(
  "fake_demographics" = swereg::fake_demographics |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "fodelseman"),
  "fake_annual_family" = swereg::fake_annual_family |>
    data.table::copy() |>
    swereg::make_lowercase_names(),
  "fake_diagnoses" = swereg::fake_diagnoses |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "indatum"),
  "fake_prescriptions" = swereg::fake_prescriptions |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "edatum"),
  "fake_cod" = swereg::fake_cod |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "dodsdat")
)
cat("Data loaded and preprocessed - ready for batched processing\n")
The three-stage batched workflow
- skeleton1_create batches: Process raw data integration in chunks, save skeleton1 files
- skeleton2_clean batches: Load skeleton1 files, clean data, save skeleton2 files
- skeleton3_analyze: Combine skeleton2 files into final analysis dataset
Phase 1: skeleton1_create (batched data integration)
Create batch processing function
# Phase 1 worker: build a swereg time skeleton for one batch of individuals,
# attach all registry data for that batch, and persist the result to disk so
# the batch can be released from memory immediately afterwards.
#
# batch_ids: person ids (lopnr) belonging to this batch
# batch_number: 1-based batch index, used in the output file name
# large_data_files: named list of pre-loaded registry data.tables
#
# Returns the path of the saved skeleton1_create_<batch_number>.qs2 file.
skeleton1_create_batch <- function(batch_ids, batch_number, large_data_files) {
# Declare variables for data.table non-standard evaluation
lopnr <- p444_lopnr_personnr <- NULL
cat("Processing batch", batch_number, "with", length(batch_ids), "individuals\n")
# Create skeleton for this batch only (the swereg::add_* calls below modify
# it in place - note skeleton is never reassigned, yet is saved with the data)
skeleton <- swereg::create_skeleton(
ids = batch_ids,
date_min = "2015-01-01",
date_max = "2018-12-31" # Shorter period for demonstration
)
# Add demographics (one-time variables; subset registry rows to this batch first)
demographics_subset <- large_data_files[["fake_demographics"]][lopnr %in% batch_ids]
if (nrow(demographics_subset) > 0) {
swereg::add_onetime(skeleton, demographics_subset, id_name = "lopnr")
}
# Add annual data (only isoyear 2015 here - presumably one call per year in
# production; confirm against the skeleton2_clean vignette)
annual_subset <- large_data_files[["fake_annual_family"]][lopnr %in% batch_ids]
if (nrow(annual_subset) > 0) {
swereg::add_annual(skeleton, annual_subset, id_name = "lopnr", isoyear = 2015)
}
# Add diagnoses: each named element creates an indicator column from the
# listed ICD-10 code prefixes (these columns are used as logicals downstream)
diagnoses_subset <- large_data_files[["fake_diagnoses"]][lopnr %in% batch_ids]
if (nrow(diagnoses_subset) > 0) {
swereg::add_diagnoses(
skeleton,
diagnoses_subset,
id_name = "lopnr",
diags = list(
"depression" = c("F32", "F33"),
"anxiety" = c("F40", "F41"),
"gender_dysphoria" = c("F64"),
"psychosis" = c("F20", "F25")
)
)
}
# Add prescriptions by ATC prefix; note the prescription registry uses a
# different id column (p444_lopnr_personnr) than the other sources
prescriptions_subset <- large_data_files[["fake_prescriptions"]][p444_lopnr_personnr %in% batch_ids]
if (nrow(prescriptions_subset) > 0) {
swereg::add_rx(
skeleton,
prescriptions_subset,
id_name = "p444_lopnr_personnr",
rxs = list(
"antidepressants" = c("N06A"),
"antipsychotics" = c("N05A"),
"hormones" = c("G03")
)
)
}
# Add cause of death indicators from the listed ICD-10 code prefixes
cod_subset <- large_data_files[["fake_cod"]][lopnr %in% batch_ids]
if (nrow(cod_subset) > 0) {
swereg::add_cods(
skeleton,
cod_subset,
id_name = "lopnr",
cods = list(
"external_death" = c("X60", "X70"),
"cardiovascular_death" = c("I21", "I22")
)
)
}
# Save batch using qs2 (much faster than RDS)
output_file <- file.path(OUTPUT_DIR, paste0("skeleton1_create_", batch_number, ".qs2"))
qs2::qs_save(skeleton, output_file)
cat("Saved skeleton1_create", batch_number, ":", nrow(skeleton), "rows\n")
return(output_file)
}
Process skeleton1_create in batches
# Run phase 1: process the first 100 individuals in 2 batches
data("fake_person_ids", package = "swereg")
ids_subset <- fake_person_ids[seq_len(100)]
id_batches <- csutil::easy_split(ids_subset, BATCH_SIZE)

# One saved-file path per batch, filled in as each batch completes
skeleton1_files <- character(length(id_batches))
for (batch_i in seq_along(id_batches)) {
  skeleton1_files[batch_i] <- skeleton1_create_batch(
    id_batches[[batch_i]], batch_i, large_data_files
  )
}
cat("skeleton1_create phase completed for", length(id_batches), "batches\n")

# CRITICAL: drop the raw registry data now that every skeleton1 batch is on
# disk - easy cleanup of "the big lump of data" is the key benefit of
# collecting everything in the large_data_files list
rm(large_data_files)
gc() # Force garbage collection
cat("Large datasets removed from memory\n")
Phase 2: skeleton2_clean (batched data cleaning)
Create cleaning function
# Phase 2 worker: load one skeleton1 batch from disk, derive and clean the
# analysis variables using only data already inside the skeleton, and save
# the result as a skeleton2 file.
#
# batch_number: 1-based batch index matching the skeleton1_create_* file name.
#
# Returns the path of the saved skeleton2_clean_<batch_number>.qs2 file.
skeleton2_clean_batch <- function(batch_number) {
cat("Cleaning batch", batch_number, "\n")
# Load skeleton1 for this batch
input_file <- file.path(OUTPUT_DIR, paste0("skeleton1_create_", batch_number, ".qs2"))
skeleton <- qs2::qs_read(input_file)
# CLEANING OPERATIONS (using only data within skeleton)
# 1. Create age variable - fodelseman's first 4 characters are taken as the
# birth year (presumably a YYYYMM birth month string; confirm against registry docs)
skeleton[, birth_year := as.numeric(substr(fodelseman, 1, 4))]
skeleton[, age := isoyear - birth_year]
# 2. Create mental health composite variables (logical OR of the indicator
# columns created by add_diagnoses in phase 1)
skeleton[, any_mental_health := depression | anxiety | psychosis]
skeleton[, severe_mental_illness := psychosis | gender_dysphoria]
# 3. Create medication concordance variables: diagnosis AND matching treatment
skeleton[, depression_treated := depression & antidepressants]
skeleton[, psychosis_treated := psychosis & antipsychotics]
# 4. Create life stage variables (fcase falls through to "unknown" for NA age)
skeleton[, life_stage := fcase(
age < 18, "child",
age >= 18 & age < 65, "adult",
age >= 65, "elderly",
default = "unknown"
)]
# 5. Create outcome variables (handle missing death columns gracefully -
# add_cods only creates the columns when matching rows existed in the batch)
if (all(c("external_death", "cardiovascular_death") %in% names(skeleton))) {
skeleton[, death_any := external_death | cardiovascular_death]
} else {
skeleton[, death_any := FALSE]
}
# 6. Filter to valid ages and reasonable time periods
skeleton <- skeleton[age >= 0 & age <= 100]
skeleton <- skeleton[isoyear >= 2015] # Remove historical rows
# 7. Create person-level summaries for annual data - assigned only on the
# yearly rows (is_isoyear == TRUE), grouped per person-year
if (skeleton[, any(is_isoyear == TRUE)]) {
skeleton[is_isoyear == TRUE, n_mental_health_year := sum(c(depression, anxiety, psychosis), na.rm = TRUE), by = .(id, isoyear)]
skeleton[is_isoyear == TRUE, treatment_adherence := mean(c(depression_treated, psychosis_treated), na.rm = TRUE), by = .(id, isoyear)]
}
# 8. Create registry tag variables (simulate case-control study; the id %% 3
# rule is an arbitrary stand-in for real matching)
skeleton[, register_tag := fcase(
gender_dysphoria == TRUE, "case",
id %% 3 == 0, "control_matched",
default = "control_population"
)]
# 9. Create shared case variables (for matched studies)
# Find first gender dysphoria diagnosis year for cases
gd_first <- skeleton[gender_dysphoria == TRUE & register_tag == "case",
.(first_gd_year = min(isoyear, na.rm = TRUE)),
by = .(id)]
# Join-update: pull first_gd_year from gd_first into the matching skeleton
# rows (first_gd_year does not yet exist in skeleton, so the RHS resolves to
# the gd_first column)
skeleton[gd_first, on = "id", first_gd_year := first_gd_year]
# For controls, assign their matched case's first GD year (simplified)
skeleton[register_tag != "case", first_gd_year := 2016] # Simplified for demo
# 10. Remove temporary variables no longer needed downstream
skeleton[, c("fodelseman", "birth_year") := NULL]
# Save cleaned skeleton using qs2
output_file <- file.path(OUTPUT_DIR, paste0("skeleton2_clean_", batch_number, ".qs2"))
qs2::qs_save(skeleton, output_file)
cat("Cleaned skeleton2_clean", batch_number, ":", nrow(skeleton), "rows,", ncol(skeleton), "columns\n")
return(output_file)
}
Phase 3: skeleton3_analyze (final analysis dataset)
Create analysis dataset from all batches
This is where batching becomes essential - skeleton3 reduces the data to only what’s needed for analysis, dramatically reducing memory usage.
Key concept: weekly→yearly data aggregation
The skeleton contains both weekly and yearly rows. In skeleton3_analyze, we collapse weekly data to yearly data using swereg::max_with_infinite_as_na() to answer the question: "Did anything happen this year?"
- For diagnoses: Did the person have depression at any point during the year?
- For treatments: Did the person receive antidepressants at any point during the year?
- For events: Did any relevant event occur during the year?
This aggregation creates person-year level data suitable for epidemiological analysis.
# Phase 3: combine all cleaned skeleton2 batches into one person-year
# analysis dataset, collapsing weekly rows to yearly summaries.
#
# skeleton2_files: character vector of paths to skeleton2_clean_*.qs2 files.
#
# Returns the combined data.table of person-years (also saved to OUTPUT_DIR
# as skeleton3_analyze.qs2).
skeleton3_analyze <- function(skeleton2_files) {
  cat("Creating analysis dataset from", length(skeleton2_files), "batches\n")
  # Load cleaned batches one at a time to keep peak memory low
  all_batches <- vector("list", length(skeleton2_files))
  for (i in seq_along(skeleton2_files)) {
    skeleton <- qs2::qs_read(skeleton2_files[i])
    # Extract analysis variables (collapse weekly data to yearly).
    # Use max_with_infinite_as_na because we're aggregating weekly data to
    # yearly: "did anything happen this year?"
    # BUG FIX: the aggregation list must sit in the j position, so the
    # leading comma is required - skeleton[, .(...), by = ...]. Without it
    # the list is interpreted as i (a join) and the call fails.
    analysis_data <- skeleton[
      ,
      .(
        # Demographic variables
        age = swereg::first_non_na(age),
        life_stage = swereg::first_non_na(life_stage),
        # Outcome variables - use max to detect "did anything happen this year?"
        any_mental_health = swereg::max_with_infinite_as_na(any_mental_health),
        severe_mental_illness = swereg::max_with_infinite_as_na(severe_mental_illness),
        depression = swereg::max_with_infinite_as_na(depression),
        anxiety = swereg::max_with_infinite_as_na(anxiety),
        psychosis = swereg::max_with_infinite_as_na(psychosis),
        gender_dysphoria = swereg::max_with_infinite_as_na(gender_dysphoria),
        # Treatment variables
        antidepressants = swereg::max_with_infinite_as_na(antidepressants),
        antipsychotics = swereg::max_with_infinite_as_na(antipsychotics),
        hormones = swereg::max_with_infinite_as_na(hormones),
        # Derived variables
        depression_treated = swereg::max_with_infinite_as_na(depression_treated),
        psychosis_treated = swereg::max_with_infinite_as_na(psychosis_treated),
        # Mortality
        death_any = swereg::max_with_infinite_as_na(death_any),
        # Study design variables
        first_gd_year = swereg::first_non_na(first_gd_year),
        # Summary variables (already person-year level, take first non-missing)
        n_mental_health_year = swereg::first_non_na(n_mental_health_year),
        treatment_adherence = swereg::first_non_na(treatment_adherence)
      ),
      by = .(id, study_year = isoyear, register_tag)
    ]
    all_batches[[i]] <- analysis_data
  }
  # Combine all batches (fill = TRUE tolerates batches with differing columns,
  # e.g. when death indicators were absent in a batch)
  final_analysis <- rbindlist(all_batches, fill = TRUE)
  # Save final analysis dataset
  output_file <- file.path(OUTPUT_DIR, "skeleton3_analyze.qs2")
  qs2::qs_save(final_analysis, output_file)
  cat("Saved skeleton3_analyze:", nrow(final_analysis), "person-years\n")
  return(final_analysis)
}
# Run phase 2 for every batch: skeleton2_files was used below but never
# defined anywhere in the script (the phase-2 loop was missing), so build it
# here by cleaning each skeleton1 batch in turn.
skeleton2_files <- character(length(skeleton1_files))
for (i in seq_along(skeleton1_files)) {
  skeleton2_files[i] <- skeleton2_clean_batch(i)
}
cat("skeleton2_clean phase completed for", length(skeleton2_files), "batches\n")

# Create final analysis dataset
analysis_data <- skeleton3_analyze(skeleton2_files)
cat("Analysis dataset created:", nrow(analysis_data), "person-years\n")
cat("Variables:", ncol(analysis_data), "\n")
cat("Study population breakdown:\n")
print(table(analysis_data$register_tag))
Analysis dataset summary
The final skeleton3_analyze contains analysis-ready data:
# Inspect the structure of the final analysis dataset
str(analysis_data)

# Example analysis: depression prevalence by register tag
depression_summary <- analysis_data[
  ,
  .(
    n_person_years = .N,
    depression_prev = mean(depression, na.rm = TRUE),
    # Guard against NaN: only compute the rate when the group has at least
    # one depression person-year
    treatment_rate = if (sum(depression, na.rm = TRUE) > 0) {
      mean(depression_treated[depression == TRUE], na.rm = TRUE)
    } else {
      NA_real_
    }
  ),
  by = register_tag
]
print(depression_summary)

# Example: treatment patterns among person-years with any mental health diagnosis
treatment_summary <- analysis_data[
  any_mental_health == TRUE,
  .(
    antidepressant_use = mean(antidepressants, na.rm = TRUE),
    antipsychotic_use = mean(antipsychotics, na.rm = TRUE),
    hormone_use = mean(hormones, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE)
  ),
  by = .(register_tag)
]
print(treatment_summary)
Memory management best practices
1. Batch size optimization
# For production studies, batch size depends on:
# - Available RAM: Larger batches use more memory but fewer file operations
# - Processing time: Very large batches can hit memory limits
# - File system: Too many small files can slow down I/O
# Recommended batch sizes:
# - 1,000-2,000 individuals for modest hardware (8-16GB RAM)
# - 5,000-10,000 individuals for high-memory systems (32-64GB RAM)
# - Each batch uses ~200-500MB RAM during processing
2. File organization
# For production studies, organize files systematically:
# OUTPUT_DIR/
# skeleton1/
# skeleton1_create_1.qs2, skeleton1_create_2.qs2, ...
# skeleton2/
# skeleton2_clean_1.qs2, skeleton2_clean_2.qs2, ...
# skeleton3/
# skeleton3_analyze.qs2
# Clean up strategy:
# 1. Keep skeleton2 files for quality checks
# 2. Remove skeleton1 files after skeleton2_clean succeeds
# 3. Archive skeleton2 files after skeleton3_analyze succeeds
3. Error handling
# Production workflows should include error handling.
# Wraps skeleton1_create_batch so a single failed batch returns NULL (and
# logs the error) instead of aborting the whole run.
skeleton1_create_batch_safe <- function(batch_ids, batch_number, large_data_files) {
  tryCatch(
    skeleton1_create_batch(batch_ids, batch_number, large_data_files),
    error = function(err) {
      cat("ERROR in batch", batch_number, ":", err$message, "\n")
      NULL
    }
  )
}
# Check for failed batches before proceeding
# failed_batches <- which(sapply(skeleton1_files, is.null))
# This allows resuming failed batch processing
Key batching principles
Memory efficiency
- Sequential processing: Process one batch at a time
- Memory cleanup: Remove large datasets after skeleton1_create
- Garbage collection: Use gc() between batches
- File-based workflow: Save/load batches to disk
Summary: the complete skeleton1→skeleton2→skeleton3 workflow
This pipeline demonstrates production-scale processing:
- skeleton1_create: Built time-structured framework and integrated raw registry data in batches
- skeleton2_clean: Cleaned variables and created derived clinical indicators in batches
- skeleton3_analyze: Combined all batches into final analysis dataset with memory-efficient aggregation
Key benefits:
- Scalable: Handles hundreds of thousands of individuals
- Memory-efficient: Uses the large_data_files pattern and systematic cleanup
- Robust: File-based workflow survives interruptions
- Analysis-ready: Final dataset optimized for statistical modeling
The skeleton3_analyze output is now ready for epidemiological analysis, regression modeling, or survival analysis.
