## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6
)

## ----setup, message = FALSE---------------------------------------------------
library(pepdiff)
library(dplyr)

## ----generate-good-data-------------------------------------------------------
set.seed(123)

n_peptides <- 40
n_reps <- 5

peptides <- paste0("PEP_", sprintf("%03d", 1:n_peptides))
genes <- paste0("GENE_", LETTERS[((1:n_peptides - 1) %% 26) + 1])

# Simulate well-behaved Gamma data
design <- expand.grid(
  peptide = peptides,
  treatment = c("ctrl", "trt"),
  bio_rep = 1:n_reps,
  stringsAsFactors = FALSE
)

sim_data <- design %>%
  mutate(
    gene_id = genes[match(peptide, peptides)],
    pep_num = as.numeric(gsub("PEP_", "", peptide)),
    base = 10 + pep_num * 0.5,
    # First 20 peptides have 2-fold treatment effect
    effect = ifelse(pep_num <= 20 & treatment == "trt", 2, 1),
    # Gamma noise (well-behaved)
    value = rgamma(n(), shape = 15, rate = 15 / (base * effect))
  ) %>%
  select(peptide, gene_id, treatment, bio_rep, value)

# Import
temp_file <- tempfile(fileext = ".csv")
write.csv(sim_data, temp_file, row.names = FALSE)

dat <- read_pepdiff(
  temp_file,
  id = "peptide",
  gene = "gene_id",
  value = "value",
  factors = "treatment",
  replicate = "bio_rep"
)

unlink(temp_file)

## ----run-glm-good, message = FALSE--------------------------------------------
results <- compare(dat, compare = "treatment", ref = "ctrl", method = "glm")

# Check fit diagnostics
diag <- plot_fit_diagnostics(results)

## ----show-good-plot, fig.height = 8-------------------------------------------
diag$plot

## ----view-flagged-------------------------------------------------------------
diag$flagged

## ----summary-stats------------------------------------------------------------
# Summary statistics
diag$summary

## ----generate-bad-data--------------------------------------------------------
set.seed(456)

# Same design but with outliers and heavy tails
sim_bad <- design %>%
  mutate(
    gene_id = genes[match(peptide, peptides)],
    pep_num = as.numeric(gsub("PEP_", "", peptide)),
    base = 10 + pep_num * 0.5,
    effect = ifelse(pep_num <= 20 & treatment == "trt", 2, 1),
    # Add outliers (10% of observations)
    outlier = ifelse(runif(n()) < 0.1, runif(n(), 3, 8), 1),
    # Heavier-tailed noise
    value = rgamma(n(), shape = 3, rate = 3 / (base * effect)) * outlier
  ) %>%
  select(peptide, gene_id, treatment, bio_rep, value)

temp_file <- tempfile(fileext = ".csv")
write.csv(sim_bad, temp_file, row.names = FALSE)

dat_bad <- read_pepdiff(
  temp_file,
  id = "peptide",
  gene = "gene_id",
  value = "value",
  factors = "treatment",
  replicate = "bio_rep"
)

unlink(temp_file)

## ----run-glm-bad, message = FALSE---------------------------------------------
results_bad <- compare(dat_bad, compare = "treatment", ref = "ctrl", method = "glm")
diag_bad <- plot_fit_diagnostics(results_bad)

## ----show-bad-plot, fig.height = 8--------------------------------------------
diag_bad$plot

## ----try-art, message = FALSE-------------------------------------------------
results_art <- compare(dat_bad, compare = "treatment", ref = "ctrl", method = "art")

# Compare significant calls
comparison <- tibble(
  peptide = results_bad$results$peptide,
  glm_sig = results_bad$results$significant,
  art_sig = results_art$results$significant
)

cat("Both significant:", sum(comparison$glm_sig & comparison$art_sig), "\n")
cat("GLM only:", sum(comparison$glm_sig & !comparison$art_sig), "\n")
cat("ART only:", sum(!comparison$glm_sig & comparison$art_sig), "\n")

