## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----eval = FALSE-------------------------------------------------------------
# # if (!require("remotes", quietly = TRUE))
# #     install.packages("remotes")
# #
# # remotes::install_github("statdivlab/radEmu")

## ----setup, message = FALSE---------------------------------------------------
library(magrittr)
library(dplyr)
library(ggplot2)
library(stringr)
library(radEmu)

## -----------------------------------------------------------------------------
data("wirbel_sample")
dim(wirbel_sample)
head(wirbel_sample)

## -----------------------------------------------------------------------------
wirbel_sample %>%
  group_by(Group) %>%
  summarize(count = n())

## -----------------------------------------------------------------------------
wirbel_sample %>%
  group_by(Country) %>%
  summarize(count = n())

## -----------------------------------------------------------------------------
wirbel_sample %>%
  group_by(Country, Group) %>%
  summarize(n = n())

## -----------------------------------------------------------------------------
data("wirbel_otu")
dim(wirbel_otu)
# let's check out a subset
wirbel_otu[1:5, 1:3]

## -----------------------------------------------------------------------------
mOTU_names <- colnames(wirbel_otu)

## -----------------------------------------------------------------------------
wirbel_sample$Group <- factor(wirbel_sample$Group, levels = c("CTR","CRC"))

## -----------------------------------------------------------------------------
chosen_genera <- c("Eubacterium", "Faecalibacterium", "Fusobacterium", "Porphyromonas")
mOTU_name_df <- data.frame(name = mOTU_names) %>% 
  mutate(base_name = stringr::str_remove(mOTU_names, "unknown ") %>%
                      stringr::str_remove("uncultured ")) %>%
  mutate(genus_name = stringr::word(base_name, 1))
restricted_mOTU_names <- mOTU_name_df %>%
  filter(genus_name %in% chosen_genera) %>%
  pull(name)

## -----------------------------------------------------------------------------
ch_study_obs <- which(wirbel_sample$Country %in% c("CHI"))

## -----------------------------------------------------------------------------
small_Y <- wirbel_otu[ch_study_obs, restricted_mOTU_names]
sum(rowSums(small_Y) == 0) # no samples have a count sum of 0 
sum(colSums(small_Y) == 0) # one category has a count sum of 0

category_to_rm <- which(colSums(small_Y) == 0)
small_Y <- small_Y[, -category_to_rm]
sum(colSums(small_Y) == 0)

## -----------------------------------------------------------------------------
ch_fit <- emuFit(formula = ~ Group, 
                 data = wirbel_sample[ch_study_obs, ],
                 Y = small_Y,
                 run_score_tests = FALSE) 

## -----------------------------------------------------------------------------
ch_fit

## ----fig.height = 6, fig.width = 6--------------------------------------------
plot(ch_fit)$plots

## ----fig.height = 6, fig.width = 6--------------------------------------------
#create data frame that has simplified names for taxa
taxa_names <- data.frame("category" = ch_fit$coef$category) %>%
  mutate(cat_small = stringr::str_remove(paste0("mOTU_", 
                            stringr::str_split(category, 'mOTU_v2_', simplify = TRUE)[, 2]), 
                            "\\]"))


#produce plot with cleaner taxa labels
plot(ch_fit, taxon_names = taxa_names)$plots

## -----------------------------------------------------------------------------
taxa_to_test <- c(which(str_detect(restricted_mOTU_names, "0777")), 
                  which(str_detect(restricted_mOTU_names, "7116")))
design <- make_design_matrix(formula = ~ Group, data = wirbel_sample[ch_study_obs, ])
colnames(design)
covariate_to_test <- 2 # we see in the previous line that the covariate we care about corresponds to the second column of the design matrix
ch_fit$B %>% rownames
two_robust_score_tests <- emuFit(formula = ~ Group,
                                 data = wirbel_sample[ch_study_obs, ],
                                 B = ch_fit$B,
                                 test_kj = data.frame(k = covariate_to_test, 
                                                      j = taxa_to_test), 
                                 Y = small_Y)

## -----------------------------------------------------------------------------
two_robust_score_tests$coef[taxa_to_test, c("covariate", "category", "estimate", "pval")]

## -----------------------------------------------------------------------------
data.frame(counts = wirbel_otu[ch_study_obs, "unknown Eubacterium [meta_mOTU_v2_7116]"],
           group = wirbel_sample$Group[ch_study_obs]) %>%
  mutate(eubact_present = counts > 0) %>%
  group_by(group, eubact_present) %>%
  count()

## -----------------------------------------------------------------------------
data.frame(counts = wirbel_otu[ch_study_obs, 
                               "Fusobacterium nucleatum s. nucleatum [ref_mOTU_v2_0777]"],
           group = wirbel_sample$Group[ch_study_obs]) %>%
  mutate(fuso_present = counts > 0) %>%
  group_by(group, fuso_present) %>%
  count()

## ----eval = FALSE-------------------------------------------------------------
# test_all <- emuFit(formula = ~ Group,
#                    data = wirbel_sample[ch_study_obs, ],
#                    B = ch_fit$B,
#                    Y = small_Y,
#                    run_score_tests = TRUE,
#                    test_kj = data.frame(k = covariate_to_test,
#                                         j = 1:ncol(small_Y)))

## ----eval=FALSE---------------------------------------------------------------
# all_fit <- emuFit(formula = ~ Group + Study + Gender +
#                     Age_spline.1 + Age_spline.2 +
#                     BMI_spline.1 + BMI_spline.2 + Sampling,
#                   data = wirbel_sample,
#                   Y = wirbel_otu[, restricted_mOTU_names],
#                   run_score_tests = FALSE)

## ----eval = FALSE-------------------------------------------------------------
# age_spline <- splines2::bSpline(wirbel_sample$Age, degree = 1, knots = median(wirbel_sample$Age))
# age_spline[,1] <- (age_spline[,1] - mean(age_spline[,1]))/sd(age_spline[,1])
# age_spline[,2] <- (age_spline[,2] - mean(age_spline[,2]))/sd(age_spline[,2])
# 
# bmi_spline <- splines2::bSpline(wirbel_sample$BMI, degree = 1, knots = median(wirbel_sample$BMI))
# bmi_spline[,1] <- (bmi_spline[,1] - mean(bmi_spline[,1]))/sd(bmi_spline[,1])
# bmi_spline[,2] <- (bmi_spline[,2] - mean(bmi_spline[,2]))/sd(bmi_spline[,2])