## ----setup-options, include = FALSE-------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center",
  warning = FALSE,
  message = FALSE,
  out.width = "100%",
  dpi = 150
)

## ----setup, message=FALSE-----------------------------------------------------
library(PubMatrixR)
library(knitr)
library(kableExtra)
library(dplyr)
library(pheatmap)
library(ggplot2)

## ----eval=FALSE---------------------------------------------------------------
# result <- PubMatrix(
#   A = gene_set_1,
#   B = gene_set_2,
#   API.key = "your_api_key_here",
#   Database = "pubmed"
# )

## ----gene_extraction, eval = FALSE--------------------------------------------
# # Extract WNT-related genes
# A <- msigdf::msigdf.human %>%
#   dplyr::filter(grepl(geneset, pattern = "wnt", ignore.case = TRUE)) %>%
#   dplyr::pull(symbol) %>%
#   unique()
# 
# # Extract obesity-related genes
# B <- msigdf::msigdf.human %>%
#   dplyr::filter(grepl(geneset, pattern = "obesity", ignore.case = TRUE)) %>%
#   dplyr::pull(symbol) %>%
#   unique()
# 
# # Sample genes for demonstration (making them equal in length)
# A <- sample(A, 10, replace = FALSE)
# B <- sample(B, 10, replace = FALSE)

## ----example_genes------------------------------------------------------------
# WNT signaling pathway genes
A <- c("WNT1", "WNT2", "WNT3A", "WNT5A", "WNT7B", "CTNNB1", "DVL1")

# Obesity-related genes
B <- c("LEPR", "ADIPOQ", "PPARG", "TNF", "IL6", "ADRB2", "INSR")

## ----pubmatrix_analysis, eval = FALSE-----------------------------------------
# # Intentionally not run during package checks: this performs live NCBI queries.
# # Run actual PubMatrix analysis
# current_year <- as.integer(format(Sys.Date(), "%Y"))
# result <- PubMatrix(
#   A = A,
#   B = B,
#   Database = "pubmed",
#   daterange = c(2020, current_year),
#   outfile = "pubmatrix_result"
# )

## ----pubmatrix_analysis_offline-----------------------------------------------
# Offline deterministic example used for vignette rendering/package checks.
result <- outer(seq_along(B), seq_along(A), function(i, j) {
  8 + (i * 6) + (j * 5) + ((i + j) %% 4) * 3 + ((i * j) %% 5)
})
result <- as.data.frame(result, check.names = FALSE)
colnames(result) <- A
rownames(result) <- B

## ----results_table------------------------------------------------------------
kable(result,
  caption = "Co-occurrence Matrix: WNT Genes vs Obesity Genes (Publication Counts)",
  align = "c",
  format = if (knitr::pandoc_to() == "html") "html" else "markdown"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  ) %>%
  kableExtra::add_header_above(c(" " = 1, "Obesity Genes" = length(B)))

## ----bar_plots, fig.width=10, fig.height=7, out.width="100%", dpi=150---------
# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data <- data.frame(
  gene = rownames(result),
  total_pubs = rowSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with B genes
a_genes_data$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data$max_overlap <- apply(result, 1, max)

# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data <- data.frame(
  gene = colnames(result),
  total_pubs = colSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with A genes
b_genes_data$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data$max_overlap <- apply(result, 2, max)

bar_plot_theme <- theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9),
    legend.box = "vertical"
  )

n_fill_a <- length(unique(a_genes_data$max_b_gene))
n_fill_b <- length(unique(b_genes_data$max_a_gene))

# Plot A genes colored by their strongest B gene partner
p1 <- ggplot(a_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List A Genes by Publication Count",
    subtitle = "Colored by strongest List B partner",
    x = "Genes (List A)",
    y = "Total Publications",
    fill = "Strongest B Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a / 4)), byrow = TRUE)) +
  bar_plot_theme


# Plot B genes colored by their strongest A gene partner
p2 <- ggplot(b_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List B Genes by Publication Count",
    subtitle = "Colored by strongest List A partner",
    x = "Genes (List B)",
    y = "Total Publications",
    fill = "Strongest A Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b / 4)), byrow = TRUE)) +
  bar_plot_theme


print(p1)
print(p2)

## ----heatmap_with_numbers_asymmetric, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Overlap percentage heatmap with values displayed in each cell"----
plot_pubmatrix_heatmap(result,
  title = "WNT-Obesity Overlap (%)",
  show_numbers = TRUE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)

## ----heatmap_clean_asymmetric, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Co-occurrence heatmap without numbers for better visual clarity"----
plot_pubmatrix_heatmap(result,
  title = "WNT-Obesity Co-occurrence (Clean)",
  show_numbers = FALSE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)

## ----asymmetric_example, eval = FALSE-----------------------------------------
# # Intentionally not run during package checks: this performs live NCBI queries.
# A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")
# 
# 
# B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")
# 
# 
# # Run actual PubMatrix analysis
# current_year <- as.integer(format(Sys.Date(), "%Y"))
# result <- PubMatrix(
#   A = A,
#   B = B,
#   Database = "pubmed",
#   daterange = c(2020, current_year),
#   outfile = "pubmatrix_result"
# )

## ----asymmetric_example_offline-----------------------------------------------
# Offline deterministic example used for vignette rendering/package checks.
A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")
B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")

result <- outer(seq_along(B), seq_along(A), function(i, j) {
  12 + (i * 4) + (j * 3) + ((i * j) %% 9) + ((i + 2 * j) %% 6)
})
result <- as.data.frame(result, check.names = FALSE)
colnames(result) <- A
rownames(result) <- B

## ----results_table_asymmetric-------------------------------------------------
kable(result,
  caption = "Co-occurrence Matrix: Longer Lists (Publication Counts)",
  align = "c",
  format = if (knitr::pandoc_to() == "html") "html" else "markdown"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  ) %>%
  kableExtra::add_header_above(c(" " = 1, "A Genes" = length(A)))

## ----bar_plots_asymmetric, fig.width=10, fig.height=8, out.width="100%", dpi=150----
# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data2 <- data.frame(
  gene = rownames(result),
  total_pubs = rowSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with B genes
a_genes_data2$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data2$max_overlap <- apply(result, 1, max)

# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data2 <- data.frame(
  gene = colnames(result),
  total_pubs = colSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with A genes
b_genes_data2$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data2$max_overlap <- apply(result, 2, max)

bar_plot_theme <- theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8.5),
    legend.box = "vertical"
  )

n_fill_a2 <- length(unique(a_genes_data2$max_b_gene))
n_fill_b2 <- length(unique(b_genes_data2$max_a_gene))

# Plot A genes colored by their strongest B gene partner
p3 <- ggplot(a_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List A Genes by Publication Count (Asymmetric)",
    subtitle = "Colored by strongest List B partner",
    x = "Genes (List A)",
    y = "Total Publications",
    fill = "Strongest B Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a2 / 4)), byrow = TRUE)) +
  bar_plot_theme


# Plot B genes colored by their strongest A gene partner
p4 <- ggplot(b_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List B Genes by Publication Count (Asymmetric)",
    subtitle = "Colored by strongest List A partner",
    x = "Genes (List B)",
    y = "Total Publications",
    fill = "Strongest A Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b2 / 4)), byrow = TRUE)) +
  bar_plot_theme


print(p3)
print(p4)

## ----heatmap_with_numbers_asymmetric2, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Overlap percentage heatmap with values displayed in each cell"----
plot_pubmatrix_heatmap(result,
  title = "Asymmetric Lists Overlap (%)",
  show_numbers = TRUE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)

## ----heatmap_clean_asymmetric2, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Co-occurrence heatmap without numbers for better visual clarity"----
plot_pubmatrix_heatmap(result,
  title = "Asymmetric Lists Co-occurrence (Clean)",
  show_numbers = FALSE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)

## ----system_info--------------------------------------------------------------
sessionInfo()

