## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----eval = FALSE-------------------------------------------------------------
# # Install from GitHub
# devtools::install_github("TeamMacLean/pepdiff")

## ----eval = FALSE-------------------------------------------------------------
# # For heatmaps
# BiocManager::install("ComplexHeatmap")
# 
# # For rank products test
# BiocManager::install("RankProd")

## ----setup, message = FALSE---------------------------------------------------
library(pepdiff)
library(dplyr)

## ----generate-data------------------------------------------------------------
set.seed(123)

# Experimental design
# 10 reps and 3-fold changes give us good power to detect true effects
n_peptides <- 50
n_reps <- 10
conditions <- c("ctrl", "trt")

# Generate peptide names and gene IDs
peptides <- paste0("PEP_", sprintf("%03d", 1:n_peptides))
genes <- paste0("GENE_", LETTERS[((1:n_peptides - 1) %% 26) + 1])

# 30% of peptides (15 out of 50) will be truly differentially abundant
n_diff <- round(n_peptides * 0.3)
diff_peptides <- sample(peptides, n_diff)
cat("Truly differential peptides:", n_diff, "\n")

# Generate abundance data
# Proteomics abundances are typically right-skewed (can't go below zero,
# multiplicative processes). We use a Gamma distribution to reflect this.
sim_data <- expand.grid(
  peptide = peptides,
  treatment = conditions,
  bio_rep = 1:n_reps,
  stringsAsFactors = FALSE
) %>%
  mutate(
    gene_id = genes[match(peptide, peptides)],
    # Base abundance varies by peptide (some peptides more abundant than others)
    base_abundance = rep(rgamma(n_peptides, shape = 5, rate = 0.5), each = length(conditions) * n_reps),
    # Treatment effect for differential peptides (~3-fold change up or down)
    fc = ifelse(peptide %in% diff_peptides & treatment == "trt",
                sample(c(0.33, 3), n_diff * n_reps, replace = TRUE, prob = c(0.3, 0.7)),
                1),
    # Final value with biological noise
    value = rgamma(n(), shape = 10, rate = 10 / (base_abundance * fc))
  ) %>%
  select(peptide, gene_id, treatment, bio_rep, value)

head(sim_data)

## ----write-temp-csv-----------------------------------------------------------
temp_file <- tempfile(fileext = ".csv")
write.csv(sim_data, temp_file, row.names = FALSE)

## ----import-data--------------------------------------------------------------
dat <- read_pepdiff(
  file = temp_file,
  id = "peptide",           # peptide identifiers
  gene = "gene_id",         # gene identifiers
  value = "value",          # abundance values
  factors = "treatment",    # experimental factors
  replicate = "bio_rep"     # biological replicate IDs
)

dat

## ----summary-data-------------------------------------------------------------
summary(dat)

## ----plot-data, fig.height = 6------------------------------------------------
plot(dat)

## ----compare------------------------------------------------------------------
results <- compare(
  dat,
  compare = "treatment",  # which factor to test
  ref = "ctrl"            # reference level (what to compare against)
)

results

## ----results-structure--------------------------------------------------------
# The main results table
head(results$results)

## ----significant--------------------------------------------------------------
sig_peptides <- significant(results)
sig_peptides

## ----plot-results, fig.height = 6---------------------------------------------
plot(results)

## ----check-results------------------------------------------------------------
# True positives: correctly identified as significant
true_pos <- sig_peptides$peptide[sig_peptides$peptide %in% diff_peptides]

# False positives: called significant but not truly differential
false_pos <- sig_peptides$peptide[!sig_peptides$peptide %in% diff_peptides]

# False negatives: truly differential but not called significant
false_neg <- diff_peptides[!diff_peptides %in% sig_peptides$peptide]

cat("True positives:", length(true_pos), "\n")
cat("False positives:", length(false_pos), "\n")
cat("False negatives:", length(false_neg), "\n")

## ----cleanup, include = FALSE-------------------------------------------------
unlink(temp_file)

