## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup, message = FALSE---------------------------------------------------
library(pepdiff)
library(dplyr)
library(ggplot2)

## ----generate-good-data-------------------------------------------------------
set.seed(123)

# Good data: clean separation, no outliers, balanced
# Using 10 reps and 3-fold changes for good power
make_good_data <- function() {
  n_peptides <- 40
  n_reps <- 10

  peptides <- paste0("PEP_", sprintf("%03d", 1:n_peptides))
  genes <- paste0("GENE_", LETTERS[((1:n_peptides - 1) %% 26) + 1])
  diff_peptides <- peptides[1:12]  # 30% differential

  sim_data <- expand.grid(
    peptide = peptides,
    treatment = c("ctrl", "trt"),
    bio_rep = 1:n_reps,
    stringsAsFactors = FALSE
  ) %>%
    mutate(
      gene_id = genes[match(peptide, peptides)],
      base = rep(rgamma(n_peptides, shape = 8, rate = 0.8), each = 2 * n_reps),
      effect = ifelse(peptide %in% diff_peptides & treatment == "trt",
                      sample(c(0.33, 3), length(diff_peptides) * n_reps, replace = TRUE),
                      1),
      value = rgamma(n(), shape = 20, rate = 20 / (base * effect))
    ) %>%
    select(peptide, gene_id, treatment, bio_rep, value)

  temp_file <- tempfile(fileext = ".csv")
  write.csv(sim_data, temp_file, row.names = FALSE)

  read_pepdiff(temp_file, id = "peptide", gene = "gene_id",
               value = "value", factors = "treatment", replicate = "bio_rep")
}

good_data <- make_good_data()

## ----generate-bad-data--------------------------------------------------------
# Problematic data: outlier sample, batch effect, systematic missingness
make_bad_data <- function() {
  n_peptides <- 40
  n_reps <- 10

  peptides <- paste0("PEP_", sprintf("%03d", 1:n_peptides))
  genes <- paste0("GENE_", LETTERS[((1:n_peptides - 1) %% 26) + 1])

  sim_data <- expand.grid(
    peptide = peptides,
    treatment = c("ctrl", "trt"),
    bio_rep = 1:n_reps,
    stringsAsFactors = FALSE
  ) %>%
    mutate(
      gene_id = genes[match(peptide, peptides)],
      base = rep(rgamma(n_peptides, shape = 5, rate = 0.5), each = 2 * n_reps),
      # Outlier: one control sample has 3x higher values
      batch_effect = ifelse(treatment == "ctrl" & bio_rep == 1, 3, 1),
      value = rgamma(n(), shape = 10, rate = 10 / (base * batch_effect))
    ) %>%
    select(peptide, gene_id, treatment, bio_rep, value)

  # Add systematic missingness: low-abundance peptides missing in treatment
  low_abundance <- peptides[31:40]
  sim_data <- sim_data %>%
    mutate(value = ifelse(peptide %in% low_abundance & treatment == "trt" & runif(n()) < 0.7,
                          NA, value))

  temp_file <- tempfile(fileext = ".csv")
  write.csv(sim_data, temp_file, row.names = FALSE)

  read_pepdiff(temp_file, id = "peptide", gene = "gene_id",
               value = "value", factors = "treatment", replicate = "bio_rep")
}

bad_data <- make_bad_data()

## ----pca-good, fig.height = 4-------------------------------------------------
plot_pca_simple(good_data)

## ----pca-bad, fig.height = 4--------------------------------------------------
plot_pca_simple(bad_data)

## ----dist-good, fig.height = 4------------------------------------------------
plot_distributions_simple(good_data)

## ----dist-bad, fig.height = 4-------------------------------------------------
plot_distributions_simple(bad_data)

## ----miss-good, fig.height = 4------------------------------------------------
plot_missingness_simple(good_data)

## ----miss-bad, fig.height = 4-------------------------------------------------
plot_missingness_simple(bad_data)

## ----run-analyses-------------------------------------------------------------
good_results <- compare(good_data, compare = "treatment", ref = "ctrl")
bad_results <- compare(bad_data, compare = "treatment", ref = "ctrl")

## ----volcano-good, fig.height = 4---------------------------------------------
plot_volcano_new(good_results)

## ----volcano-bad, fig.height = 4----------------------------------------------
plot_volcano_new(bad_results)

## ----pval-good, fig.height = 4------------------------------------------------
plot_pvalue_histogram(good_results)

## ----pval-bad, fig.height = 4-------------------------------------------------
plot_pvalue_histogram(bad_results)

## ----fc-good, fig.height = 4--------------------------------------------------
plot_fc_distribution_new(good_results)

## ----fc-bad, fig.height = 4---------------------------------------------------
plot_fc_distribution_new(bad_results)

## ----export-example, eval = FALSE---------------------------------------------
# p <- plot_volcano_new(good_results) +
#   labs(title = "Treatment vs Control") +
#   theme_minimal(base_size = 14)
# 
# ggsave("volcano.pdf", p, width = 6, height = 5)

## ----customise-example, fig.height = 4----------------------------------------
plot_volcano_new(good_results) +
  theme_classic() +
  labs(title = "My Custom Volcano Plot")