## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----install, eval = FALSE----------------------------------------------------
# # Install from CRAN (when available)
# install.packages("pep725")
# 
# # Or install development version from GitHub
# # First install devtools if you don't have it:
# # install.packages("devtools")
# devtools::install_github("matthias-da/pep725")

## ----setup, message=FALSE, warning=FALSE--------------------------------------
library(pep725)

## ----download, message=FALSE--------------------------------------------------
# Download synthetic data (cached locally after first download)
pep <- pep_download()

## ----cachestatus, message=FALSE-----------------------------------------------
# Check cache status - shows file location and size
pep_cache_info()

# If you need to clear the cache (e.g., to get an updated version):
# pep_cache_clear()

## ----load-seed, eval = FALSE--------------------------------------------------
# # Load the included seed dataset (small, for quick tests only)
# data(pep_seed)
# 
# # This is a small subset (~1,300 rows) useful for:
# # - Testing code quickly
# # - Working offline
# # - Running examples in documentation

## ----simulate, eval = FALSE---------------------------------------------------
# # Generate synthetic data based on your real data
# # This preserves statistical properties while anonymizing observations
# pep_synthetic <- pep_simulate(your_real_pep_data)
# 
# # You can then share pep_synthetic freely

## ----import, eval = FALSE-----------------------------------------------------
# # Import from downloaded PEP725 files
# # Point to the folder containing your downloaded CSV files
# pep <- pep_import("path/to/pep725_data/")

## ----own-data, eval = FALSE---------------------------------------------------
# # Prepare your data with the required columns
# my_data <- data.frame(
#   s_id = c("MY001", "MY001", "MY002"),
#   lon = c(10.1, 10.1, 11.3),
#   lat = c(47.5, 47.5, 48.1),
#   genus = c("Malus", "Malus", "Malus"),
#   species = c("Malus domestica", "Malus domestica", "Malus domestica"),
#   phase_id = c(60, 65, 60),
#   year = c(2020, 2020, 2020),
#   day = c(110, 125, 108)
# )
# 
# # Convert to a pep object — validates required columns automatically
# my_pep <- as.pep(my_data)

## ----structure----------------------------------------------------------------
# View the data - the print method shows a summary
print(pep)

## ----first-look, fig.height = 4-----------------------------------------------
# Where are the stations?
plot(pep, type = "map")

# How are phenological events distributed across the year?
plot(pep, type = "histogram")

## ----class--------------------------------------------------------------------
# Check the class hierarchy
class(pep)

## ----summary------------------------------------------------------------------
# Default summary - shows top species by observation count
summary(pep)

## ----summary-phase------------------------------------------------------------
# Summary by phenological phase
summary(pep, by = "phase")

## ----summary-country----------------------------------------------------------
# Summary by country (showing top 5)
summary(pep, by = "country", top = 5)

## ----summary-year-------------------------------------------------------------
# Temporal coverage summary
summary(pep, by = "year")

## ----subset-------------------------------------------------------------------
# Filter to apple data using data.table syntax
# The syntax is: dataset[row_filter, column_selection]
apple <- pep[species == "Malus domestica"]
print(apple)

# Filter by year range
recent <- pep[year >= 2000 & year <= 2015]
print(recent)

# The summary method works on subsets too
summary(recent)

## ----bbch---------------------------------------------------------------------
# Get descriptions for BBCH codes present in your data
codes <- unique(pep$phase_id)
bbch_description(codes)

## ----filter-phase-------------------------------------------------------------
# Get all flowering observations (BBCH 60-69)
flowering <- pep[phase_id >= 60 & phase_id <= 69]
cat("Flowering observations:", nrow(flowering), "\n")

# Get only full flowering (BBCH 65)
full_flowering <- pep[phase_id == 65]
cat("Full flowering observations:", nrow(full_flowering), "\n")

# Get harvest-related observations
harvest <- pep[phase_id %in% c(87, 89)]
cat("Harvest observations:", nrow(harvest), "\n")

## ----coverage-all-------------------------------------------------------------
# Get a complete coverage report
pep_coverage(pep)

## ----coverage-temporal--------------------------------------------------------
# Temporal coverage with a visualization
pep_coverage(pep, kind = "temporal", plot = TRUE)

## ----coverage-geo-------------------------------------------------------------
# Geographical coverage - which countries have most data
pep_coverage(pep, kind = "geographical", top = 5)

## ----coverage-species---------------------------------------------------------
# Species coverage - which species are best represented
pep_coverage(pep, kind = "species", top = 5)

## ----coverage-by-country------------------------------------------------------
# Temporal coverage broken down by country
cov_by_country <- pep_coverage(pep, kind = "temporal", by = "country")
# Show observations by group
cov_by_country$temporal$obs_by_group

## ----quick-analysis-----------------------------------------------------------
# Step 1: Filter to apple flowering (BBCH 60 = first flowers)
apple_flowering <- pep[species == "Malus domestica" & phase_id == 60]

cat("Apple flowering observations:", nrow(apple_flowering), "\n")
cat("Year range:", min(apple_flowering$year), "-", max(apple_flowering$year), "\n")
cat("Countries:", length(unique(apple_flowering$country)), "\n")

# Step 2: Calculate annual mean DOY
# The data.table syntax here means:
#   - Group by year
#   - Calculate mean DOY and count observations
#   - Order by year
annual_mean <- apple_flowering[, .(
  mean_doy = mean(day, na.rm = TRUE),
  n_obs = .N
), by = year][order(year)]

# Step 3: Look at the results
print(annual_mean)

## ----quick-analysis-plot------------------------------------------------------
# Step 4: Plot the trend with ggplot2
library(ggplot2)

p <- ggplot(annual_mean, aes(x = year, y = mean_doy)) +
  geom_point(aes(size = n_obs), color = "steelblue", alpha = 0.6) +
  geom_line(color = "steelblue", alpha = 0.4) +
  geom_smooth(method = "lm", color = "red", linetype = 2, se = FALSE) +
  labs(
    title = "Apple Flowering Date Over Time",
    x = "Year", y = "Mean Day of Year",
    size = "Observations"
  ) +
  theme_minimal()
print(p)

# Step 5: Quantify the trend
trend <- lm(mean_doy ~ year, data = annual_mean)
slope <- coef(trend)[2]
cat("\nTrend:", round(slope * 10, 2), "days per decade\n")
if (slope < 0) {
  cat("Interpretation: Flowering is getting EARLIER\n")
} else {
  cat("Interpretation: Flowering is getting LATER\n")
}

## ----quick-analysis-vine------------------------------------------------------
# Step 1: Filter to grapevine flowering (BBCH 65 = full flowering)
vine_flowering <- pep[species == "Vitis vinifera" & phase_id == 65]

cat("Grapevine flowering observations:", nrow(vine_flowering), "\n")
cat("Year range:", min(vine_flowering$year), "-", max(vine_flowering$year), "\n")
cat("Countries:", length(unique(vine_flowering$country)), "\n")

# Step 2: Calculate annual mean DOY
vine_annual <- vine_flowering[, .(
  mean_doy = mean(day, na.rm = TRUE),
  n_obs = .N
), by = year][order(year)]

# Step 3: Plot the trend
p <- ggplot(vine_annual, aes(x = year, y = mean_doy)) +
  geom_point(aes(size = n_obs), color = "purple", alpha = 0.6) +
  geom_line(color = "purple", alpha = 0.4) +
  geom_smooth(method = "lm", color = "red", linetype = 2, se = FALSE) +
  labs(
    title = "Grapevine Flowering Date Over Time",
    x = "Year", y = "Mean Day of Year",
    size = "Observations"
  ) +
  theme_minimal()
print(p)

trend <- lm(mean_doy ~ year, data = vine_annual)
slope <- coef(trend)[2]
cat("\nTrend:", round(slope * 10, 2), "days per decade\n")

## ----eval = FALSE-------------------------------------------------------------
# vignette("phenological-analysis", package = "pep725")

## ----eval = FALSE-------------------------------------------------------------
# vignette("spatial-patterns", package = "pep725")

## ----eval = FALSE-------------------------------------------------------------
# vignette("data-quality", package = "pep725")

## ----session------------------------------------------------------------------
sessionInfo()