library(data.table)
library(swereg)
# library(qs2)  # Fast file I/O; the batching code below uses qs2::qs_save()/qs_read()

Introduction

This vignette demonstrates the skeleton3_analyze stage: memory-efficient batching techniques for processing very large datasets with limited RAM.

Important note: This stage is only required if you have very large datasets (>100,000 individuals) and insufficient RAM. For most analyses, you can use the cleaned skeleton2_clean output directly for statistical modeling.

Prerequisites: Complete the “Cleaning and deriving variables (skeleton2_clean)” vignette first, as this stage builds on skeleton2_clean output.

What is skeleton3_analyze?

The skeleton3_analyze stage focuses on:

  • Final data aggregation: Weekly→yearly data collapsing as needed
  • Analysis dataset creation: Optimized for specific research questions
  • Memory-efficient processing: Batching strategies for large populations
  • Production workflows: Scalable approaches for hundreds of thousands of individuals

This stage creates the final analysis datasets ready for statistical modeling.

When to use batching strategies

Use batching strategies when:

  • Study population > 100,000 individuals
  • Memory constraints: Limited RAM for the full dataset
  • Processing time: Long-running operations that benefit from parallel processing
  • File size management: Breaking large datasets into manageable chunks
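A quick way to gauge what a given batch size implies is to count the batch files it will produce. A base-R sketch (split() with ceiling() here stands in for csutil::easy_split(), which this vignette uses later):

```r
# How many batch files will a given batch size produce?
# split() + ceiling() is a plain base-R stand-in for csutil::easy_split().
n_individuals <- 100000
batch_size <- 2500
ids <- seq_len(n_individuals)
id_batches <- split(ids, ceiling(seq_along(ids) / batch_size))
length(id_batches)  # 40 batch files per phase
```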

Setup: memory-efficient workflow

# Setup for batched processing
BATCH_SIZE <- 50  # Small for demonstration - use 1000-5000 for real studies
OUTPUT_DIR <- tempdir()  # Use temporary directory for vignette

# Setup for demonstration (see skeleton1_create vignette for detailed data integration)
# For batching, we organize data into large_data_files list for memory management
large_data_files <- list(
  "fake_demographics" = swereg::fake_demographics |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "fodelseman"),
  "fake_annual_family" = swereg::fake_annual_family |>
    swereg::make_lowercase_names(),
  "fake_diagnoses" = swereg::fake_diagnoses |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "indatum"),
  "fake_prescriptions" = swereg::fake_prescriptions |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "edatum"),
  "fake_cod" = swereg::fake_cod |>
    data.table::copy() |>
    swereg::make_lowercase_names(date_columns = "dodsdat")
)

cat("Data loaded and preprocessed - ready for batched processing\n")

The three-stage batched workflow

  1. skeleton1_create batches: Process raw data integration in chunks, save skeleton1 files
  2. skeleton2_clean batches: Load skeleton1 files, clean data, save skeleton2 files
  3. skeleton3_analyze: Combine skeleton2 files into final analysis dataset

Phase 1: skeleton1_create (batched data integration)

Create batch processing function

skeleton1_create_batch <- function(batch_ids, batch_number, large_data_files) {
  # Declare variables for data.table non-standard evaluation
  lopnr <- p444_lopnr_personnr <- NULL
  
  cat("Processing batch", batch_number, "with", length(batch_ids), "individuals\n")
  
  # Create skeleton for this batch
  skeleton <- swereg::create_skeleton(
    ids = batch_ids,
    date_min = "2015-01-01",
    date_max = "2018-12-31"  # Shorter period for demonstration
  )
  
  # Add demographics
  demographics_subset <- large_data_files[["fake_demographics"]][lopnr %in% batch_ids]
  if (nrow(demographics_subset) > 0) {
    swereg::add_onetime(skeleton, demographics_subset, id_name = "lopnr")
  }
  
  # Add annual data
  annual_subset <- large_data_files[["fake_annual_family"]][lopnr %in% batch_ids]
  if (nrow(annual_subset) > 0) {
    swereg::add_annual(skeleton, annual_subset, id_name = "lopnr", isoyear = 2015)
  }
  
  # Add diagnoses
  diagnoses_subset <- large_data_files[["fake_diagnoses"]][lopnr %in% batch_ids]

  if (nrow(diagnoses_subset) > 0) {
    swereg::add_diagnoses(
      skeleton,
      diagnoses_subset,
      id_name = "lopnr",
      diags = list(
        "depression" = c("F32", "F33"),
        "anxiety" = c("F40", "F41"),
        "gender_dysphoria" = c("F64"),
        "psychosis" = c("F20", "F25")
      )
    )
  }
  
  # Add prescriptions
  prescriptions_subset <- large_data_files[["fake_prescriptions"]][p444_lopnr_personnr %in% batch_ids]
  if (nrow(prescriptions_subset) > 0) {
    swereg::add_rx(
      skeleton,
      prescriptions_subset,
      id_name = "p444_lopnr_personnr",
      rxs = list(
        "antidepressants" = c("N06A"),
        "antipsychotics" = c("N05A"),
        "hormones" = c("G03")
      )
    )
  }
  
  # Add cause of death
  cod_subset <- large_data_files[["fake_cod"]][lopnr %in% batch_ids]
  if (nrow(cod_subset) > 0) {
    swereg::add_cods(
      skeleton,
      cod_subset,
      id_name = "lopnr",
      cods = list(
        "external_death" = c("X60", "X70"),
        "cardiovascular_death" = c("I21", "I22")
      )
    )
  }
  
  # Save batch using qs2 (much faster than RDS)
  output_file <- file.path(OUTPUT_DIR, paste0("skeleton1_create_", batch_number, ".qs2"))
  qs2::qs_save(skeleton, output_file)
  
  cat("Saved skeleton1_create", batch_number, ":", nrow(skeleton), "rows\n")
  return(output_file)
}

Process skeleton1_create in batches

# Process first 100 individuals in 2 batches
data("fake_person_ids", package = "swereg")
ids_subset <- fake_person_ids[1:100]
id_batches <- csutil::easy_split(ids_subset, BATCH_SIZE)

skeleton1_files <- vector("character", length(id_batches))
for (i in seq_along(id_batches)) {
  skeleton1_files[i] <- skeleton1_create_batch(id_batches[[i]], i, large_data_files)
}

cat("skeleton1_create phase completed for", length(id_batches), "batches\n")

# CRITICAL: Remove large datasets from memory
# This is the key benefit of organizing data into large_data_files - 
# easy cleanup of "the big lump of data" after skeleton1_create is complete
rm(large_data_files)
gc()  # Force garbage collection

cat("Large datasets removed from memory\n")

Phase 2: skeleton2_clean (batched data cleaning)

Create cleaning function

skeleton2_clean_batch <- function(batch_number) {
  cat("Cleaning batch", batch_number, "\n")
  
  # Load skeleton1 for this batch
  input_file <- file.path(OUTPUT_DIR, paste0("skeleton1_create_", batch_number, ".qs2"))
  skeleton <- qs2::qs_read(input_file)
  
  # CLEANING OPERATIONS (using only data within skeleton)
  
  # 1. Create age variable
  skeleton[, birth_year := as.numeric(substr(fodelseman, 1, 4))]
  skeleton[, age := isoyear - birth_year]
  
  # 2. Create mental health composite variables
  skeleton[, any_mental_health := depression | anxiety | psychosis]
  skeleton[, severe_mental_illness := psychosis | gender_dysphoria]
  
  # 3. Create medication concordance variables
  skeleton[, depression_treated := depression & antidepressants]
  skeleton[, psychosis_treated := psychosis & antipsychotics]
  
  # 4. Create life stage variables
  skeleton[, life_stage := fcase(
    age < 18, "child",
    age >= 18 & age < 65, "adult", 
    age >= 65, "elderly",
    default = "unknown"
  )]
  
  # 5. Create outcome variables (handle missing death columns gracefully)
  if (all(c("external_death", "cardiovascular_death") %in% names(skeleton))) {
    skeleton[, death_any := external_death | cardiovascular_death]
  } else {
    skeleton[, death_any := FALSE]
  }
  
  # 6. Filter to valid ages and reasonable time periods
  skeleton <- skeleton[age >= 0 & age <= 100]
  skeleton <- skeleton[isoyear >= 2015]  # Remove historical rows
  
  # 7. Create person-level summaries for annual data
  if (skeleton[, any(is_isoyear == TRUE)]) {
    skeleton[is_isoyear == TRUE, n_mental_health_year := sum(c(depression, anxiety, psychosis), na.rm = TRUE), by = .(id, isoyear)]
    skeleton[is_isoyear == TRUE, treatment_adherence := mean(c(depression_treated, psychosis_treated), na.rm = TRUE), by = .(id, isoyear)]
  }
  
  # 8. Create registry tag variables (simulate case-control study)
  skeleton[, register_tag := fcase(
    gender_dysphoria == TRUE, "case",
    id %% 3 == 0, "control_matched",
    default = "control_population"
  )]
  
  # 9. Create shared case variables (for matched studies)
  # Find first gender dysphoria diagnosis for cases
  gd_first <- skeleton[gender_dysphoria == TRUE & register_tag == "case", 
                       .(first_gd_year = min(isoyear, na.rm = TRUE)), 
                       by = .(id)]
  
  # Add to skeleton
  skeleton[gd_first, on = "id", first_gd_year := i.first_gd_year]
  
  # For controls, assign their matched case's first GD year (simplified)
  skeleton[register_tag != "case", first_gd_year := 2016L]  # Simplified for demo
  
  # 10. Remove temporary variables
  skeleton[, c("fodelseman", "birth_year") := NULL]
  
  # Save cleaned skeleton using qs
  output_file <- file.path(OUTPUT_DIR, paste0("skeleton2_clean_", batch_number, ".qs2"))
  qs2::qs_save(skeleton, output_file)
  
  cat("Cleaned skeleton2_clean", batch_number, ":", nrow(skeleton), "rows,", ncol(skeleton), "columns\n")
  return(output_file)
}

Process skeleton2_clean in batches

# Process all batches for skeleton2_clean
skeleton2_files <- vector("character", length(id_batches))
for (i in seq_along(id_batches)) {
  skeleton2_files[i] <- skeleton2_clean_batch(i)
}

cat("skeleton2_clean phase completed\n")

Phase 3: skeleton3_analyze (final analysis dataset)

Create analysis dataset from all batches

This is where batching becomes essential: skeleton3_analyze collapses the data to only what is needed for analysis, dramatically reducing memory usage.

Key concept: weekly→yearly data aggregation

The skeleton contains both weekly and yearly rows. In skeleton3_analyze, we collapse weekly data to yearly data using swereg::max_with_infinite_as_na() to answer the question: “Did anything happen this year?”

  • For diagnoses: Did the person have depression at any point during the year?
  • For treatments: Did the person receive antidepressants at any point during the year?
  • For events: Did any relevant event occur during the year?

This aggregation creates person-year level data suitable for epidemiological analysis.
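To see why a dedicated helper is needed here rather than plain max(): in base R, max(x, na.rm = TRUE) on an all-NA vector warns and returns -Inf. The stand-in below is a sketch of the idea, not swereg's actual implementation:

```r
# Sketch of the "did anything happen this year?" aggregation. max_na() mimics
# the behaviour described for swereg::max_with_infinite_as_na(): it returns NA
# (instead of -Inf) when every weekly value is missing.
max_na <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) return(NA)  # no information at all for this year
  max(x)
}

weekly_depression <- c(0L, 0L, 1L, NA)  # four weeks of one year
max_na(weekly_depression)               # 1: something happened this year
max_na(c(NA, NA, NA))                   # NA rather than -Inf
```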

skeleton3_analyze <- function(skeleton2_files) {
  cat("Creating analysis dataset from", length(skeleton2_files), "batches\n")
  
  # Load all cleaned batches
  all_batches <- vector("list", length(skeleton2_files))
  for (i in seq_along(skeleton2_files)) {
    skeleton <- qs2::qs_read(skeleton2_files[i])
    
    # Extract analysis variables (collapse weekly data to yearly)
    # Use max_with_infinite_as_na because we're aggregating weekly data to yearly:
    # "did anything happen this year?"
    analysis_data <- skeleton[, .(
        # Demographic variables
        age = swereg::first_non_na(age),
        life_stage = swereg::first_non_na(life_stage),
        
        # Outcome variables - use max to detect "did anything happen this year?"
        any_mental_health = swereg::max_with_infinite_as_na(any_mental_health),
        severe_mental_illness = swereg::max_with_infinite_as_na(severe_mental_illness),
        depression = swereg::max_with_infinite_as_na(depression),
        anxiety = swereg::max_with_infinite_as_na(anxiety),
        psychosis = swereg::max_with_infinite_as_na(psychosis),
        gender_dysphoria = swereg::max_with_infinite_as_na(gender_dysphoria),
        
        # Treatment variables
        antidepressants = swereg::max_with_infinite_as_na(antidepressants),
        antipsychotics = swereg::max_with_infinite_as_na(antipsychotics),
        hormones = swereg::max_with_infinite_as_na(hormones),
        
        # Derived variables
        depression_treated = swereg::max_with_infinite_as_na(depression_treated),
        psychosis_treated = swereg::max_with_infinite_as_na(psychosis_treated),
        
        # Mortality
        death_any = swereg::max_with_infinite_as_na(death_any),
        
        # Study design variables
        first_gd_year = swereg::first_non_na(first_gd_year),
        
        # Summary variables
        n_mental_health_year = swereg::first_non_na(n_mental_health_year),
        treatment_adherence = swereg::first_non_na(treatment_adherence)
      ),
      by = .(id, study_year = isoyear, register_tag)
    ]
    
    all_batches[[i]] <- analysis_data
  }
  
  # Combine all batches
  final_analysis <- rbindlist(all_batches, fill = TRUE)
  
  # Save final analysis dataset
  output_file <- file.path(OUTPUT_DIR, "skeleton3_analyze.qs2")
  qs2::qs_save(final_analysis, output_file)
  
  cat("Saved skeleton3_analyze:", nrow(final_analysis), "person-years\n")
  
  return(final_analysis)
}

# Create final analysis dataset
analysis_data <- skeleton3_analyze(skeleton2_files)

cat("Analysis dataset created:", nrow(analysis_data), "person-years\n")
cat("Variables:", ncol(analysis_data), "\n")
cat("Study population breakdown:\n")
print(table(analysis_data$register_tag))

Analysis dataset summary

The final skeleton3_analyze contains analysis-ready data:

# Show structure
str(analysis_data)

# Example analysis: Depression prevalence by register tag
depression_summary <- analysis_data[, .(
  n_person_years = .N,
  depression_prev = mean(depression, na.rm = TRUE),
  # Guard the treatment rate against NaN when a group has no depression person-years
  treatment_rate = ifelse(sum(depression, na.rm = TRUE) > 0, 
                         mean(depression_treated[depression == TRUE], na.rm = TRUE), 
                         NA_real_)
), by = .(register_tag)]

print(depression_summary)
# Example: Mental health treatment patterns
treatment_summary <- analysis_data[any_mental_health == TRUE, .(
  antidepressant_use = mean(antidepressants, na.rm = TRUE),
  antipsychotic_use = mean(antipsychotics, na.rm = TRUE),
  hormone_use = mean(hormones, na.rm = TRUE),
  mean_age = mean(age, na.rm = TRUE)
), by = register_tag]

print(treatment_summary)

Memory management best practices

1. Batch size optimization

# For production studies, batch size depends on:
# - Available RAM: Larger batches use more memory but fewer file operations
# - Processing time: Very large batches can hit memory limits
# - File system: Too many small files can slow down I/O

# Recommended batch sizes:
# - 1,000-2,000 individuals for modest hardware (8-16GB RAM)
# - 5,000-10,000 individuals for high-memory systems (32-64GB RAM)
# - Each batch uses ~200-500MB RAM during processing
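These recommendations can be sanity-checked with a back-of-envelope estimate. The figures below (53 rows per person-year, ~100 columns of 8-byte values) are rough assumptions, not measurements:

```r
# Rough per-batch row and memory estimate. A weekly skeleton holds about 52
# weekly rows plus 1 yearly row per person per year; ~100 numeric columns at
# 8 bytes each is an assumed upper bound, not a measured figure.
estimate_batch_rows <- function(batch_size, n_years) {
  batch_size * n_years * (52 + 1)
}

rows <- estimate_batch_rows(batch_size = 2000, n_years = 4)
rows                            # 424000 rows in one in-memory batch
round(rows * 100 * 8 / 1024^2)  # rough in-memory size in MB
```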

2. File organization

# For production studies, organize files systematically:
# OUTPUT_DIR/
#   skeleton1/
#     skeleton1_create_1.qs2, skeleton1_create_2.qs2, ...
#   skeleton2/
#     skeleton2_clean_1.qs2, skeleton2_clean_2.qs2, ...
#   skeleton3/
#     skeleton3_analyze.qs2

# Clean up strategy:
# 1. Keep skeleton2 files for quality checks
# 2. Remove skeleton1 files after skeleton2_clean succeeds
# 3. Archive skeleton2 files after skeleton3_analyze succeeds
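The cleanup strategy above can be scripted. This sketch assumes the skeleton1_create_N.qs2 / skeleton2_clean_N.qs2 naming used in this vignette, and only deletes skeleton1 files once at least as many skeleton2 files exist:

```r
# Remove skeleton1 batch files only after skeleton2_clean has succeeded.
# Assumes this vignette's file-naming scheme.
remove_skeleton1_if_safe <- function(dir) {
  s1 <- list.files(dir, pattern = "^skeleton1_create_.*\\.qs2$", full.names = TRUE)
  s2 <- list.files(dir, pattern = "^skeleton2_clean_.*\\.qs2$", full.names = TRUE)
  if (length(s1) > 0 && length(s2) >= length(s1)) {
    file.remove(s1)
    TRUE
  } else {
    FALSE  # nothing removed: skeleton2 is incomplete (or already cleaned up)
  }
}

# remove_skeleton1_if_safe(OUTPUT_DIR)
```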

3. Error handling

# Production workflows should include error handling
skeleton1_create_batch_safe <- function(batch_ids, batch_number, large_data_files) {
  tryCatch({
    return(skeleton1_create_batch(batch_ids, batch_number, large_data_files))
  }, error = function(e) {
    cat("ERROR in batch", batch_number, ":", e$message, "\n")
    return(NULL)
  })
}

# Check for failed batches before proceeding
# failed_batches <- which(sapply(skeleton1_files, is.null))
# This allows resuming failed batch processing
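The commented-out resume logic above can be completed as follows. One caveat: a character vector cannot hold NULL, so batch results must be collected in a list for the is.null() check to work. The loop below is a hypothetical sketch built on that assumption:

```r
# Hypothetical resume loop: collect results in a list (a failed batch is NULL),
# then retry only the failures before moving on to skeleton2_clean.
skeleton1_results <- vector("list", length(id_batches))
for (i in seq_along(id_batches)) {
  skeleton1_results[[i]] <- skeleton1_create_batch_safe(id_batches[[i]], i, large_data_files)
}

failed <- which(vapply(skeleton1_results, is.null, logical(1)))
for (i in failed) {  # retry each failed batch once
  skeleton1_results[[i]] <- skeleton1_create_batch_safe(id_batches[[i]], i, large_data_files)
}
stopifnot(!any(vapply(skeleton1_results, is.null, logical(1))))
```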

Key batching principles

Memory efficiency

  1. Sequential processing: Process one batch at a time
  2. Memory cleanup: Remove large datasets after skeleton1_create
  3. Garbage collection: Use gc() between batches
  4. File-based workflow: Save/load batches to disk

Data integrity

  1. Batch validation: Check that all individuals are processed
  2. File verification: Ensure all batch files exist before skeleton3
  3. Data consistency: Verify variables across batches
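The first of these checks is cheap to run right after splitting (using the id_batches and ids_subset objects created earlier in this vignette):

```r
# Verify that batching neither dropped nor duplicated any individual.
all_batched_ids <- unlist(id_batches, use.names = FALSE)
stopifnot(
  length(all_batched_ids) == length(ids_subset),  # no duplicates
  setequal(all_batched_ids, ids_subset)           # no one missing
)
```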

Summary: the complete skeleton1→skeleton2→skeleton3 workflow

This pipeline demonstrates production-scale processing:

  1. skeleton1_create: Built time-structured framework and integrated raw registry data in batches
  2. skeleton2_clean: Cleaned variables and created derived clinical indicators in batches
  3. skeleton3_analyze: Combined all batches into final analysis dataset with memory-efficient aggregation

Key benefits:

  • Scalable: handles hundreds of thousands of individuals
  • Memory-efficient: uses the large_data_files pattern and systematic cleanup
  • Robust: the file-based workflow survives interruptions
  • Analysis-ready: the final dataset is optimized for statistical modeling

The skeleton3_analyze output is now ready for epidemiological analysis, regression modeling, or survival analysis.