## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----message=FALSE, warning=FALSE---------------------------------------------
library(gsDesign)
library(gsDesignNB)
library(data.table)
library(ggplot2)
library(gt)

## ----sample-size--------------------------------------------------------------
# Sample size calculation
# Enrollment: constant rate over 12 months
# Trial duration: 24 months
event_gap_val <- 20 / 30.4375 # Minimum gap between events is 20 days (approx)

nb_ss <- sample_size_nbinom(
  lambda1 = 1.5 / 12, # Control event rate (per month)
  lambda2 = 1.0 / 12, # Experimental event rate (per month)
  dispersion = 0.5, # Overdispersion parameter
  power = 0.9, # 90% power
  alpha = 0.025, # One-sided alpha
  accrual_rate = 1, # This will be scaled to achieve the target power
  accrual_duration = 12, # 12 months enrollment
  trial_duration = 24, # 24 months trial
  max_followup = 12, # 12 months of follow-up per patient
  dropout_rate = -log(0.95) / 12, # 5% dropout rate at 1 year
  event_gap = event_gap_val
)

# Print key results
message("Fixed design")
nb_ss

## ----gs-design----------------------------------------------------------------
# Analysis times (in months)
analysis_times <- c(10, 18, 24)

# Create group sequential design with integer sample sizes
gs_nb <- gsNBCalendar(
  x = nb_ss, # Input fixed design for negative binomial
  k = 3, # 3 analyses
  test.type = 4, # Two-sided asymmetric, non-binding futility
  sfu = sfLinear, # Linear spending function for upper bound
  sfupar = c(.5, .5), # Identity function
  sfl = sfHSD, # HSD spending for lower bound
  sflpar = -8, # Conservative futility bound
  usTime = c(.1, .18, 1), # Upper spending timing
  lsTime = NULL, # Spending based on information
  analysis_times = analysis_times # Calendar times in months
) |> gsDesignNB::toInteger() # Round to integer sample size

## -----------------------------------------------------------------------------
summary(gs_nb)

## -----------------------------------------------------------------------------
gs_nb |>
  gsDesign::gsBoundSummary(
    deltaname = "RR",
    logdelta = TRUE,
    Nname = "Information",
    timename = "Month",
    digits = 4,
    ddigits = 2
  ) |>
  gt() |>
  tab_header(
    title = "Group Sequential Design Bounds for Negative Binomial Outcome",
    subtitle = paste0(
      "N = ", ceiling(gs_nb$n_total[gs_nb$k]),
      ", Expected events = ", round(gs_nb$nb_design$total_events, 1)
    )
  )

## ----load-results-------------------------------------------------------------
# Load pre-computed simulation results
results_file <- system.file("extdata", "gs_simulation_results.rds", package = "gsDesignNB")

if (results_file == "" && file.exists("../inst/extdata/gs_simulation_results.rds")) {
  results_file <- "../inst/extdata/gs_simulation_results.rds"
}

if (results_file != "") {
  sim_data <- readRDS(results_file)
  results <- sim_data$results
  summary_gs <- sim_data$summary_gs
  n_sims <- sim_data$n_sims
  params <- sim_data$params
} else {
  # Fallback if data is not available (e.g. not installed yet)
  warning("Simulation results not found. Skipping simulation analysis.")
  results <- NULL
  summary_gs <- NULL
  n_sims <- 0
}

## ----summary-table, eval=!is.null(results), results='asis'--------------------
# Helper function for trimmed mean (to handle outliers in blinded info)
trimmed_mean <- function(x, trim = 0.01) {
  x <- x[is.finite(x) & !is.na(x)]
  if (length(x) == 0) return(NA_real_)
  mean(x, trim = trim)
}

# Create comprehensive theoretical vs simulation comparison table for each analysis
dt <- data.table::as.data.table(results)

# Function to compute comparison for a specific analysis
get_analysis_comparison <- function(analysis_num) {
  sub_dt <- dt[analysis == analysis_num]
  
  # Get theoretical values from gs_nb design
  theo_n <- gs_nb$n_total[analysis_num]
  theo_n1 <- gs_nb$n1[analysis_num]
  theo_n2 <- gs_nb$n2[analysis_num]
  theo_exposure <- gs_nb$exposure[analysis_num]
  theo_exposure_at_risk1 <- gs_nb$exposure_at_risk1[analysis_num]
  theo_exposure_at_risk2 <- gs_nb$exposure_at_risk2[analysis_num]
  theo_events1 <- gs_nb$events1[analysis_num]
  theo_events2 <- gs_nb$events2[analysis_num]
  theo_events <- gs_nb$events[analysis_num]
  theo_variance <- gs_nb$variance[analysis_num]
  theo_info <- gs_nb$n.I[analysis_num]
  
  # Observed values (using trimmed means for robustness)
  obs_n <- mean(sub_dt$n_enrolled)
  obs_n1 <- mean(sub_dt$n_ctrl)
  obs_n2 <- mean(sub_dt$n_exp)
  obs_exposure1 <- mean(sub_dt$exposure_total_ctrl)
  obs_exposure2 <- mean(sub_dt$exposure_total_exp)
  obs_exposure_at_risk1 <- mean(sub_dt$exposure_at_risk_ctrl)
  obs_exposure_at_risk2 <- mean(sub_dt$exposure_at_risk_exp)
  obs_events1 <- mean(sub_dt$events_ctrl)
  obs_events2 <- mean(sub_dt$events_exp)
  obs_events <- mean(sub_dt$events_total)
  obs_variance <- median(sub_dt$se^2, na.rm = TRUE)
  obs_info_blinded <- trimmed_mean(sub_dt$blinded_info, trim = 0.01)
  obs_info_unblinded <- trimmed_mean(sub_dt$unblinded_info, trim = 0.01)
  
  # Build comparison data frame
  data.frame(
    Metric = c(
      "N Enrolled",
      "N Control",
      "N Experimental",
      "Total Exposure - Control",
      "Total Exposure - Experimental",
      "Exposure at Risk - Control",
      "Exposure at Risk - Experimental",
      "Events - Control",
      "Events - Experimental",
      "Events - Total",
      "Variance of log(RR)",
      "Information (Blinded)",
      "Information (Unblinded)"
    ),
    Theoretical = c(
      theo_n, theo_n1, theo_n2,
      theo_n1 * theo_exposure, theo_n2 * theo_exposure,
      theo_n1 * theo_exposure_at_risk1, theo_n2 * theo_exposure_at_risk2,
      theo_events1, theo_events2, theo_events,
      theo_variance, theo_info, theo_info
    ),
    Simulated = c(
      obs_n, obs_n1, obs_n2,
      obs_exposure1, obs_exposure2,
      obs_exposure_at_risk1, obs_exposure_at_risk2,
      obs_events1, obs_events2, obs_events,
      obs_variance, obs_info_blinded, obs_info_unblinded
    ),
    stringsAsFactors = FALSE
  )
}

# Generate comparison table for each analysis
for (k in 1:3) {
  cat(sprintf("\n### Analysis %d (Month %d)\n\n", k, params$analysis_times[k]))
  
  comparison_k <- get_analysis_comparison(k)
  comparison_k$Difference <- comparison_k$Simulated - comparison_k$Theoretical
  comparison_k$Rel_Diff_Pct <- 100 * comparison_k$Difference / abs(comparison_k$Theoretical)
  
  print(
    comparison_k |>
      gt() |>
      tab_header(
        title = sprintf("Analysis %d: Theoretical vs Simulated", k),
        subtitle = sprintf("Calendar time = %d months", params$analysis_times[k])
      ) |>
      fmt_number(columns = c(Theoretical, Simulated, Difference), decimals = 2) |>
      fmt_number(columns = Rel_Diff_Pct, decimals = 1) |>
      cols_label(
        Metric = "",
        Theoretical = "Theoretical",
        Simulated = "Simulated",
        Difference = "Difference",
        Rel_Diff_Pct = "Rel. Diff (%)"
      ) |>
      tab_row_group(label = md("**Information**"), rows = grepl("Information|Variance", Metric)) |>
      tab_row_group(label = md("**Events**"), rows = grepl("Events", Metric)) |>
      tab_row_group(label = md("**Exposure**"), rows = grepl("Exposure", Metric)) |>
      tab_row_group(label = md("**Sample Size**"), rows = grepl("^N ", Metric)) |>
      row_group_order(groups = c("**Sample Size**", "**Exposure**", "**Events**", "**Information**"))
  )
  
  # Add boundary crossing summary
  sub_dt <- dt[analysis == k]
  cat(sprintf("\n**Boundary Crossing:**\n"))
  cat(sprintf("- Efficacy (upper): %.1f%% (n=%d)\n", 
              mean(sub_dt$cross_upper) * 100, sum(sub_dt$cross_upper)))
  cat(sprintf("- Futility (lower): %.1f%% (n=%d)\n", 
              mean(sub_dt$cross_lower) * 100, sum(sub_dt$cross_lower)))
  cat(sprintf("- Cumulative Efficacy: %.1f%%\n\n", 
              sum(dt[analysis <= k]$cross_upper) / n_sims * 100))
}

## ----overall-summary, eval=!is.null(results)----------------------------------
cat("=== Overall Operating Characteristics ===\n")
cat(sprintf("Number of simulations: %d\n", n_sims))
cat(sprintf("Overall Power (P[reject H0]): %.1f%% (SE: %.1f%%)\n", 
            summary_gs$power * 100, 
            sqrt(summary_gs$power * (1 - summary_gs$power) / n_sims) * 100))
cat(sprintf("Futility Stopping Rate: %.1f%%\n", summary_gs$futility * 100))
cat(sprintf("Design Power (target): %.1f%%\n", (1 - gs_nb$beta) * 100))

## ----power-comparison, eval=!is.null(results)---------------------------------
# Create comparison table
crossing_summary <- data.frame(
  Analysis = 1:3,
  Analysis_Time = params$analysis_times,
  Sim_Power = summary_gs$analysis_summary$prob_cross_upper,
  Sim_Cum_Power = summary_gs$analysis_summary$cum_prob_upper,
  Design_Cum_Power = cumsum(gs_nb$upper$prob[, 2])
)

crossing_summary |>
  gt() |>
  tab_header(
    title = "Power Comparison: Simulation vs Design",
    subtitle = sprintf("Based on %d simulated trials", n_sims)
  ) |>
  cols_label(
    Analysis = "Analysis",
    Analysis_Time = "Month",
    Sim_Power = "Incremental Power (Sim)",
    Sim_Cum_Power = "Cumulative Power (Sim)",
    Design_Cum_Power = "Cumulative Power (Design)"
  ) |>
  fmt_percent(columns = c(Sim_Power, Sim_Cum_Power, Design_Cum_Power), decimals = 1)

## ----plot-z-stats, fig.width=7, fig.height=5, fig.alt="Z-statistics across analyses with group sequential boundaries", eval=!is.null(results)----
# Prepare data for plotting
plot_data <- results
plot_data$z_flipped <- -plot_data$z_stat # Flip for efficacy direction

# Boundary data
bounds_df <- data.frame(
  analysis = 1:gs_nb$k,
  upper = gs_nb$upper$bound,
  lower = gs_nb$lower$bound
)

ggplot(plot_data, aes(x = factor(analysis), y = z_flipped)) +
  geom_violin(fill = "steelblue", alpha = 0.5, color = "steelblue") +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  # Draw bounds as lines connecting analyses
  geom_line(
    data = bounds_df, aes(x = analysis, y = upper, group = 1),
    linetype = "dashed", color = "darkgreen", linewidth = 1
  ) +
  geom_line(
    data = bounds_df, aes(x = analysis, y = lower, group = 1),
    linetype = "dashed", color = "darkred", linewidth = 1
  ) +
  # Draw points for bounds
  geom_point(data = bounds_df, aes(x = analysis, y = upper), color = "darkgreen") +
  geom_point(data = bounds_df, aes(x = analysis, y = lower), color = "darkred") +
  geom_hline(yintercept = 0, color = "gray50") +
  labs(
    title = "Simulated Z-Statistics by Analysis",
    subtitle = "Green dashed = efficacy bound, Red dashed = futility bound",
    x = "Analysis",
    y = "Z-statistic (positive = favors experimental)"
  ) +
  theme_minimal() +
  ylim(c(-4, 6))