## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup, message=FALSE, warning=FALSE--------------------------------------
library(pep725)
library(data.table)

# Download the synthetic dataset
pep <- pep_download()

# For this vignette, we'll focus on flowering phases
flowering <- pep[phase_id %in% c(60, 65)]
cat("Flowering observations:", nrow(flowering), "\n")
cat("Species:", length(unique(flowering$species)), "\n")
cat("Year range:", min(flowering$year), "-", max(flowering$year), "\n")

## ----flag-basic---------------------------------------------------------------
# Flag outliers using the default 30-day rule
outliers <- pep_flag_outliers(
  pep = flowering,
  method = "30day",
  by = c("species", "phase_id")
)

print(outliers)

## ----flag-methods, eval=FALSE-------------------------------------------------
# # MAD method: flag if > 3 MAD from median (robust)
# outliers_mad <- pep_flag_outliers(flowering, method = "mad", threshold = 3)

## ----iqr-method, eval=FALSE---------------------------------------------------
# # IQR method: flag if outside 1.5 * IQR (standard boxplot rule)
# outliers_iqr <- pep_flag_outliers(flowering, method = "iqr", threshold = 1.5)

## ----zscore-method, eval=FALSE------------------------------------------------
# # Z-score method: flag if |z| > 3 (assumes normal distribution)
# outliers_zscore <- pep_flag_outliers(flowering, method = "zscore", threshold = 3)

## ----gam-method, eval=FALSE---------------------------------------------------
# # Model-based detection across the whole species x phase group.
# outliers_gam <- pep_flag_outliers(
#   apple_flowering,
#   by = c("genus", "species", "phase_id"),
#   method = "gam_residual",
#   threshold = 3.5
# )

## ----mahalanobis-method, eval=FALSE-------------------------------------------
# # Multivariate detection on the phase profile per station-year.
# # by = c("genus", "species") pools across phases so the covariance
# # is estimated from the joint (BBCH 60, BBCH 65, BBCH 89) distribution.
# outliers_mahal <- pep_flag_outliers(
#   apple,
#   by = c("genus", "species"),
#   method = "mahalanobis"
# )

## ----flag-summary-------------------------------------------------------------
summary(outliers)

## ----flag-compare-------------------------------------------------------------
# Check outliers for grapevine
vine_flowering <- pep[species == "Vitis vinifera" & phase_id %in% c(60, 65)]

if (nrow(vine_flowering) > 0) {
  outliers_vine <- pep_flag_outliers(
    pep = vine_flowering,
    method = "30day",
    by = c("species", "phase_id")
  )

  cat("Outlier comparison:\n")
  cat("All flowering species: ", round(100 * mean(outliers$is_outlier), 2), "%\n")
  cat("Grapevine only:        ", round(100 * mean(outliers_vine$is_outlier), 2), "%\n")
} else {
  cat("No grapevine flowering data available for comparison.\n")
}

## ----plot-overview, fig.height=6----------------------------------------------
# Overview of outlier patterns
pep_plot_outliers(outliers, type = "overview")

## ----plot-seasonal, fig.height=5----------------------------------------------
# When do outliers occur in the year?
pep_plot_outliers(outliers, type = "seasonal")

## ----plot-detail, fig.height=5------------------------------------------------
# See outliers in context of all observations
pep_plot_outliers(outliers, type = "detail", n_top = 15)

## ----plot-map, fig.height=5---------------------------------------------------
# Where are outliers located?
pep_plot_outliers(outliers, type = "map")

## ----plot-diagnostic-gam, eval=FALSE------------------------------------------
# # For gam_residual: classical model diagnostics
# outliers_gam <- pep_flag_outliers(
#   apple_flowering,
#   by = c("genus", "species", "phase_id"),
#   method = "gam_residual"
# )
# pep_plot_outliers(outliers_gam, type = "diagnostic")

## ----plot-diagnostic-mahal, eval=FALSE----------------------------------------
# # For mahalanobis: MD-specific diagnostics
# outliers_mahal <- pep_flag_outliers(
#   apple,
#   by = c("genus", "species"),
#   method = "mahalanobis"
# )
# pep_plot_outliers(outliers_mahal, type = "diagnostic")

## ----plot-profile, eval=FALSE-------------------------------------------------
# # Phase-profile parallel coordinates (primary Mahalanobis figure).
# pep_plot_outliers(outliers_mahal, type = "profile")

## ----completeness-basic-------------------------------------------------------
# Check completeness by station and phase
# Use year_range to focus on a specific period
completeness <- pep_completeness(
  pep = flowering,
  by = c("s_id", "phase_id"),
  year_range = c(1990, 2020)
)

print(completeness)

## ----completeness-summary-----------------------------------------------------
summary(completeness)

## ----completeness-filter------------------------------------------------------
# Get stations with good coverage (>= 70%)
good_coverage <- completeness[completeness_pct >= 70]
cat("Stations with >= 70% coverage:", nrow(good_coverage), "\n")

# Use these for trend analysis
good_stations <- unique(good_coverage$s_id)
flowering_complete <- flowering[s_id %in% good_stations]
cat("Observations from complete stations:", nrow(flowering_complete), "\n")

## ----completeness-plot, fig.height=5------------------------------------------
plot(completeness)

## ----check-basic--------------------------------------------------------------
# Check if expected phases are present for apple
apple <- pep[species == "Malus domestica"]

phase_check <- pep_check_phases(
  pep = apple,
  expected = c(60, 65, 87)  # flowering, full flowering, fruit maturity
)

print(phase_check)

## ----check-multi--------------------------------------------------------------
# Check phases for multiple species
multi_check <- pep_check_phases_multi(
  pep = pep,
  species_list = c("Malus domestica", "Vitis vinifera"),
  expected = c(60, 65, 87)
)

print(multi_check)

## ----workflow-----------------------------------------------------------------
# ══════════════════════════════════════════════════════════════════════════════
# STEP 1: Assess temporal completeness
# ══════════════════════════════════════════════════════════════════════════════
# Why: Incomplete stations can bias trend estimates and normals

completeness <- pep_completeness(flowering, by = c("s_id", "phase_id"))
good_stations <- completeness[completeness_pct >= 50, s_id]
fl_filtered <- flowering[s_id %in% good_stations]

cat("Kept", length(good_stations), "stations with >= 50% completeness\n")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 2: Flag statistical outliers
# ══════════════════════════════════════════════════════════════════════════════
# Why: Identify observations that deviate from expected patterns

outliers_wf <- pep_flag_outliers(fl_filtered, method = "mad", threshold = 3)

cat("Flagged", sum(outliers_wf$is_outlier), "outliers",
    "(", round(100 * mean(outliers_wf$is_outlier), 1), "% )\n")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 3: Make informed decisions about exclusion
# ══════════════════════════════════════════════════════════════════════════════
# Key principle: Document your decisions!

# Option A: Strict cleaning (for normals calculation)
fl_strict <- outliers_wf[is_outlier == FALSE]

# Option B: Moderate cleaning (keep moderate outliers)
fl_moderate <- outliers_wf[is_outlier == FALSE | abs(deviation) < 60]

cat("Strict cleaning keeps:", nrow(fl_strict), "obs\n")
cat("Moderate cleaning keeps:", nrow(fl_moderate), "obs\n")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 4: Proceed with analysis on cleaned data
# ══════════════════════════════════════════════════════════════════════════════

normals <- pheno_normals(fl_moderate, period = 1991:2020, min_years = 5)
cat("Normals calculated for", sum(!is.na(normals$mean_doy)), "groups\n")

## ----eval = FALSE-------------------------------------------------------------
# vignette("getting-started", package = "pep725")

## ----eval = FALSE-------------------------------------------------------------
# vignette("phenological-analysis", package = "pep725")

## ----eval = FALSE-------------------------------------------------------------
# vignette("spatial-patterns", package = "pep725")

## ----session------------------------------------------------------------------
sessionInfo()

