## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  warning = FALSE,
  message = FALSE,
  fig.alt = "Canadian Census example visualization of Bayesian surprise or funnel plot values."
)

is_pkgdown <- identical(Sys.getenv("IN_PKGDOWN"), "true")
has_cancensus_key <- nzchar(Sys.getenv("CM_API_KEY")) ||
  nzchar(Sys.getenv("CANCENSUS_API_KEY"))
run_cancensus_examples <- is_pkgdown && has_cancensus_key

## ----load-packages------------------------------------------------------------
library(bayesiansurpriser)
library(cancensus)
library(sf)
library(ggplot2)
library(dplyr)

# Set your CensusMapper API key (do this once)
# set_cancensus_api_key("YOUR_KEY_HERE", install = TRUE)

# Optionally set a cache path for faster subsequent queries
# set_cancensus_cache_path("path/to/cache", install = TRUE)

## ----list-datasets, eval=FALSE------------------------------------------------
# # List available Census datasets
# datasets <- list_census_datasets()
# print(datasets %>% select(dataset, description))

## ----list-regions, eval=FALSE-------------------------------------------------
# # List regions for the 2021 Census
# regions_2021 <- list_census_regions("CA21")
# head(regions_2021 %>% filter(level == "CMA"))

## ----fetch-income-data, eval=run_cancensus_examples---------------------------
# # Find relevant vectors for low income
# # v_CA21_1085: Total - Low-income status in 2020
# # v_CA21_1086: 0 to 17 years in low income
# # v_CA21_1090: In low income
# 
# # Get low income data at the Census Division level for all of Canada
# # Using labels = "short" for cleaner column names
# income_data <- get_census(
#   dataset = "CA21",
#   regions = list(C = "01"),  # All of Canada
#   vectors = c(
#     total_pop = "v_CA21_1085",  # Total population for low income status
#     low_income = "v_CA21_1090"   # Population in low income
#   ),
#   level = "CD",
#   geo_format = "sf",
#   labels = "short",
#   quiet = TRUE
# )
# 
# # Clean and filter
# income_data <- income_data %>%
#   filter(!is.na(total_pop) & !is.na(low_income)) %>%
#   filter(total_pop > 0) %>%
#   mutate(
#     low_income_rate = low_income / total_pop
#   )
# 
# head(income_data %>% st_drop_geometry() %>% select(GeoUID, `Region Name`, total_pop, low_income, low_income_rate))

## ----compute-income-surprise, eval=run_cancensus_examples---------------------
# # Compute surprise on the low-income-rate distribution.
# income_surprise <- surprise(
#   income_data,
#   observed = low_income_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# 
# print(income_surprise)
# summary(income_surprise)

## ----plot-income-surprise, fig.width=10, fig.height=8, eval=run_cancensus_examples----
# # Add surprise values to data
# income_data$surprise <- get_surprise(income_surprise, "surprise")
# income_data$signed_surprise <- get_surprise(income_surprise, "signed")
# 
# # Transform to Statistics Canada Lambert projection for better Canada visualization
# income_data_lambert <- st_transform(income_data, "EPSG:3347")
# 
# # Plot Canada-wide
# ggplot(income_data_lambert) +
#   geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.1) +
#   scale_fill_surprise_diverging(name = "Signed\nsurprise") +
#   labs(
#     title = "Atypical Low Income Rates by Census Division",
#     subtitle = "Signed surprise from the cross-division low-income-rate distribution",
#     caption = "Source: Statistics Canada, 2021 Census"
#   ) +
#   theme_void() +
#   theme(
#     legend.position = "right",
#     plot.title = element_text(hjust = 0.5, face = "bold"),
#     plot.subtitle = element_text(hjust = 0.5)
#   )

## ----top-surprise-regions, eval=run_cancensus_examples------------------------
# # Top regions with positive signed surprise
# high_income_surprise <- income_data %>%
#   st_drop_geometry() %>%
#   filter(signed_surprise > 0) %>%
#   arrange(desc(surprise)) %>%
#   select(`Region Name`, low_income_rate, total_pop, surprise) %>%
#   head(10)
# 
# cat("Census Divisions with positive signed surprise:\n")
# print(high_income_surprise)
# 
# # Top regions with negative signed surprise
# low_income_surprise <- income_data %>%
#   st_drop_geometry() %>%
#   filter(signed_surprise < 0) %>%
#   arrange(desc(surprise)) %>%
#   select(`Region Name`, low_income_rate, total_pop, surprise) %>%
#   head(10)
# 
# cat("\nCensus Divisions with negative signed surprise:\n")
# print(low_income_surprise)

## ----fetch-housing-data, eval=run_cancensus_examples--------------------------
# # Get housing tenure data for Vancouver CMA
# # v_CA21_4237: Total private households by tenure
# # v_CA21_4238: Owner households
# 
# housing_data <- get_census(
#   dataset = "CA21",
#   regions = list(CMA = "59933"),  # Vancouver CMA
#   vectors = c(
#     total_hh = "v_CA21_4237",  # Total households
#     owner_hh = "v_CA21_4238"   # Owner households
#   ),
#   level = "CT",
#   geo_format = "sf",
#   labels = "short",
#   quiet = TRUE
# )
# 
# housing_data <- housing_data %>%
#   filter(!is.na(total_hh) & !is.na(owner_hh)) %>%
#   filter(total_hh > 50) %>%  # Filter small tracts
#   mutate(
#     owner_rate = owner_hh / total_hh
#   )
# 
# cat("Number of Census Tracts:", nrow(housing_data), "\n")

## ----housing-surprise, fig.width=9, fig.height=7, eval=run_cancensus_examples----
# # Compute surprise on the home-ownership-rate distribution.
# housing_surprise <- surprise(
#   housing_data,
#   observed = owner_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# 
# housing_data$signed_surprise <- get_surprise(housing_surprise, "signed")
# 
# # Plot Vancouver
# ggplot(housing_data) +
#   geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) +
#   scale_fill_surprise_diverging(name = "Signed\nsurprise") +
#   labs(
#     title = "Atypical Home Ownership Rates in Metro Vancouver",
#     subtitle = "Signed surprise from the tract-level owner-rate distribution",
#     caption = "Source: Statistics Canada, 2021 Census"
#   ) +
#   theme_void() +
#   theme(
#     legend.position = "right",
#     plot.title = element_text(hjust = 0.5, face = "bold"),
#     plot.subtitle = element_text(hjust = 0.5, size = 9)
#   )

## ----fetch-language-data, eval=run_cancensus_examples-------------------------
# # Get language data for Toronto CMA
# # v_CA21_1144: Total - Knowledge of official languages
# # v_CA21_1153: Neither English nor French
# 
# language_data <- get_census(
#   dataset = "CA21",
#   regions = list(CMA = "35535"),  # Toronto CMA
#   vectors = c(
#     total_pop = "v_CA21_1144",  # Total population
#     no_official = "v_CA21_1153"   # Neither official language
#   ),
#   level = "CT",
#   geo_format = "sf",
#   labels = "short",
#   quiet = TRUE
# )
# 
# language_data <- language_data %>%
#   filter(!is.na(total_pop) & !is.na(no_official)) %>%
#   filter(total_pop > 100) %>%
#   mutate(no_official_rate = no_official / total_pop)

## ----language-surprise, fig.width=9, fig.height=7, eval=run_cancensus_examples----
# # Compute surprise on the non-official-language-rate distribution.
# lang_surprise <- surprise(
#   language_data,
#   observed = no_official_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# 
# language_data$signed_surprise <- get_surprise(lang_surprise, "signed")
# 
# # Plot Toronto
# ggplot(language_data) +
#   geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) +
#   scale_fill_surprise_diverging(name = "Signed\nsurprise") +
#   labs(
#     title = "Atypical Non-Official Language Concentrations in Toronto",
#     subtitle = "Signed surprise from the tract-level non-official-language-rate distribution",
#     caption = "Source: Statistics Canada, 2021 Census"
#   ) +
#   theme_void() +
#   theme(
#     legend.position = "right",
#     plot.title = element_text(hjust = 0.5, face = "bold"),
#     plot.subtitle = element_text(hjust = 0.5, size = 9)
#   )

## ----provincial-funnel, eval=run_cancensus_examples---------------------------
# # Get provincial data
# provincial_data <- get_census(
#   dataset = "CA21",
#   regions = list(C = "01"),
#   vectors = c(
#     total_pop = "v_CA21_1085",  # Total for low income
#     low_income = "v_CA21_1090"   # In low income
#   ),
#   level = "PR",
#   geo_format = NA,
#   labels = "short",
#   quiet = TRUE
# )
# 
# provincial_data <- provincial_data %>%
#   filter(!is.na(total_pop) & total_pop > 0)
# 
# # Compute funnel data for points
# funnel_df <- compute_funnel_data(
#   observed = provincial_data$low_income,
#   sample_size = provincial_data$total_pop,
#   type = "count",
#   limits = c(2, 3)
# )
# funnel_df$province <- provincial_data$`Region Name`
# funnel_df$rate <- funnel_df$observed / funnel_df$sample_size
# 
# # National rate
# national_rate <- sum(provincial_data$low_income) / sum(provincial_data$total_pop)
# 
# # Create continuous funnel bands
# n_range <- range(provincial_data$total_pop)
# n_seq <- exp(seq(log(n_range[1] * 0.8), log(n_range[2] * 1.2), length.out = 200))
# se_seq <- sqrt(national_rate * (1 - national_rate) / n_seq)
# 
# funnel_bands <- data.frame(
#   n = n_seq,
#   rate = national_rate,
#   lower_2sd = national_rate - 2 * se_seq,
#   upper_2sd = national_rate + 2 * se_seq,
#   lower_3sd = national_rate - 3 * se_seq,
#   upper_3sd = national_rate + 3 * se_seq
# )
# 
# # Create funnel plot
# ggplot() +
#   # 3 SD band (outer)
#   geom_ribbon(
#     data = funnel_bands,
#     aes(x = n, ymin = lower_3sd, ymax = upper_3sd),
#     fill = "#9ecae1", alpha = 0.5
#   ) +
#   # 2 SD band (inner)
#   geom_ribbon(
#     data = funnel_bands,
#     aes(x = n, ymin = lower_2sd, ymax = upper_2sd),
#     fill = "#3182bd", alpha = 0.5
#   ) +
#   # National average line
#   geom_hline(
#     yintercept = national_rate,
#     linetype = "dashed", color = "#636363", linewidth = 0.8
#   ) +
#   # Points
#   geom_point(
#     data = funnel_df,
#     aes(x = sample_size, y = rate, color = abs(z_score) > 2),
#     size = 4
#   ) +
#   # Labels
#   ggrepel::geom_text_repel(
#     data = funnel_df,
#     aes(x = sample_size, y = rate, label = province),
#     size = 3, fontface = "bold",
#     max.overlaps = 15,
#     segment.color = "gray60"
#   ) +
#   scale_color_manual(
#     values = c("FALSE" = "#636363", "TRUE" = "#e6550d"),
#     guide = "none"
#   ) +
#   scale_x_log10(
#     labels = scales::comma,
#     breaks = c(1e4, 1e5, 1e6, 1e7)
#   ) +
#   scale_y_continuous(labels = scales::percent) +
#   labs(
#     title = "Funnel Plot: Provincial Low Income Rates",
#     subtitle = "Provinces outside bands have surprising rates given population size",
#     x = "Population (log scale)",
#     y = "Low Income Rate",
#     caption = "Source: Statistics Canada, 2021 Census"
#   ) +
#   theme_minimal() +
#   theme(
#     plot.title = element_text(face = "bold"),
#     panel.grid.minor = element_blank()
#   )

## ----temporal-comparison, eval=run_cancensus_examples-------------------------
# # Get provincial data for 2016 and 2021
# # Note: Variable IDs differ between Census years
# 
# # 2021 data
# prov_2021 <- get_census(
#   dataset = "CA21",
#   regions = list(C = "01"),
#   vectors = c(total_pop = "v_CA21_1085", low_income = "v_CA21_1090"),
#   level = "PR",
#   geo_format = NA,
#   labels = "short",
#   quiet = TRUE
# )
# prov_2021 <- prov_2021[!is.na(prov_2021$total_pop) & prov_2021$total_pop > 0, ]
# prov_2021$low_income_rate <- prov_2021$low_income / prov_2021$total_pop
# prov_2021$census_year <- 2021
# 
# # 2016 data (different vector IDs for same concept)
# prov_2016 <- get_census(
#   dataset = "CA16",
#   regions = list(C = "01"),
#   vectors = c(total_pop = "v_CA16_2540", low_income = "v_CA16_2555"),
#   level = "PR",
#   geo_format = NA,
#   labels = "short",
#   quiet = TRUE
# )
# prov_2016 <- prov_2016[!is.na(prov_2016$total_pop) & prov_2016$total_pop > 0, ]
# prov_2016$low_income_rate <- prov_2016$low_income / prov_2016$total_pop
# prov_2016$census_year <- 2016
# 
# # Compute surprise for each year's provincial low-income-rate distribution
# surprise_2021 <- surprise(
#   prov_2021,
#   observed = low_income_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# prov_2021$signed_surprise <- get_surprise(surprise_2021, "signed")
# 
# surprise_2016 <- surprise(
#   prov_2016,
#   observed = low_income_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# prov_2016$signed_surprise <- get_surprise(surprise_2016, "signed")
# 
# # Compare changes
# comparison <- merge(
#   prov_2016[, c("GeoUID", "Region Name", "signed_surprise")],
#   prov_2021[, c("GeoUID", "signed_surprise")],
#   by = "GeoUID",
#   suffixes = c("_2016", "_2021")
# )
# comparison$change <- comparison$signed_surprise_2021 - comparison$signed_surprise_2016
# print(comparison[order(abs(comparison$change), decreasing = TRUE), ])

## ----custom-models, eval=run_cancensus_examples-------------------------------
# # Using the Vancouver housing data
# # Create a custom model space
# 
# # Create custom model space for rates
# housing_models <- model_space(
#   bs_model_gaussian(),
#   bs_model_sampled()
# )
# 
# print(housing_models)
# 
# # Use surprise() with custom model space
# custom_result <- surprise(
#   housing_data,
#   observed = owner_rate,
#   models = housing_models
# )
# 
# housing_data$custom_surprise <- get_surprise(custom_result, "signed")
# 
# # Summary statistics
# cat("Custom model surprise summary:\n")
# summary(housing_data$custom_surprise)

## ----interpret-surprise, eval=run_cancensus_examples--------------------------
# # Categorize regions by surprise level
# income_data <- income_data %>%
#   mutate(
#     surprise_category = cut(
#       abs(signed_surprise),
#       breaks = c(0, 0.1, 0.3, 0.5, 1.0, Inf),
#       labels = c("Trivial", "Minor", "Moderate", "Substantial", "High"),
#       include.lowest = TRUE
#     )
#   )
# 
# # Summary by category
# cat("Census Divisions by surprise level:\n")
# print(table(income_data$surprise_category))

## ----session-info-------------------------------------------------------------
sessionInfo()