## ----setup, include=FALSE-----------------------------------------------------
# Vignette chunk defaults: collapsed source/output with "#>" output prefixes.
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE
)
# Consistent figure dimensions across all plots in this vignette.
knitr::opts_chunk$set(
  fig.height = 5,
  fig.width = 7
)

library(compositional.mle)

# Fix the RNG so the simulated data and stochastic solvers are reproducible.
set.seed(42)

## ----mixture-problem----------------------------------------------------------
# Generate mixture data: 70 draws from N(0, 1) and 30 from N(5, 1.5^2)
x_mix <- c(rnorm(70, mean = 0, sd = 1), rnorm(30, mean = 5, sd = 1.5))

# Log-likelihood for a 2-component Gaussian mixture.
#
# theta: numeric vector c(mu1, sigma1, mu2, sigma2, pi1), where pi1 is the
#        mixing weight of the first component.
# x:     numeric data vector; defaults to the globally simulated x_mix, so
#        existing single-argument callers (the solvers) are unaffected.
#
# Returns the scalar log-likelihood, or -Inf for parameter values outside
# the support (non-positive sigmas, or pi1 outside the open interval (0, 1)).
mixture_loglike <- function(theta, x = x_mix) {
  mu1 <- theta[1]; s1 <- theta[2]
  mu2 <- theta[3]; s2 <- theta[4]
  pi1 <- theta[5]

  # Outside the support: return -Inf (not NaN) so maximizers reject the point.
  if (s1 <= 0 || s2 <= 0 || pi1 <= 0 || pi1 >= 1) return(-Inf)

  # Per-observation, weight-adjusted component log-densities.
  log_p1 <- log(pi1) + dnorm(x, mu1, s1, log = TRUE)
  log_p2 <- log(1 - pi1) + dnorm(x, mu2, s2, log = TRUE)

  # Log-sum-exp: factor out the per-observation max so the exp() calls cannot
  # underflow when one component's density dwarfs the other's.
  log_max <- pmax(log_p1, log_p2)
  sum(log_max + log(exp(log_p1 - log_max) + exp(log_p2 - log_max)))
}

# Bundle the likelihood with its parameter constraints. The support predicate
# rejects invalid parameter vectors; the projection nudges a vector back into
# the feasible region (floors the sigmas, clamps the mixing weight).
problem_mix <- mle_problem(
  loglike = mixture_loglike,
  constraint = mle_constraint(
    support = function(theta) {
      all(theta[c(2, 4)] > 0) && theta[5] > 0 && theta[5] < 1
    },
    project = function(theta) {
      theta[2] <- max(theta[2], 0.1)                 # sigma1 floor
      theta[4] <- max(theta[4], 0.1)                 # sigma2 floor
      theta[5] <- min(max(theta[5], 0.01), 0.99)     # keep pi1 in [0.01, 0.99]
      theta
    }
  )
)

## ----single-solver------------------------------------------------------------
# A deliberately naive starting point, away from both true means.
theta0 <- c(-2, 1, 3, 1, 0.5)

# Solvers are curried: configuring gradient_ascent() yields a function that
# is then applied to the (problem, start) pair.
run_ga <- gradient_ascent(max_iter = 100)
result_simple <- run_ga(problem_mix, theta0)

cat("Simple gradient ascent:\n")
cat("  mu1 =", round(result_simple$theta.hat[1], 2),
    " mu2 =", round(result_simple$theta.hat[3], 2), "\n")
cat("  Log-likelihood:", round(result_simple$loglike, 2), "\n")

## ----coarse-to-fine-----------------------------------------------------------
# Smart initialization: cluster the data with k-means and derive per-cluster
# moments, labelling the lower-mean cluster as component 1.
km <- kmeans(x_mix, centers = 2)
lo <- which.min(km$centers)
hi <- which.max(km$centers)

theta_init <- c(
  km$centers[lo],                  # mu1
  sd(x_mix[km$cluster == lo]),     # sigma1
  km$centers[hi],                  # mu2
  sd(x_mix[km$cluster == hi]),     # sigma2
  mean(km$cluster == lo)           # pi1: fraction assigned to component 1
)

# Coarse-to-fine: a broad random search hands its best point to a longer
# gradient-ascent refinement (%>>% chains solvers left to right).
strategy <- random_search(
  sampler = function() c(runif(1, -5, 10), runif(1, 0.5, 3),
                         runif(1, -5, 10), runif(1, 0.5, 3),
                         runif(1, 0.2, 0.8)),
  n = 50
) %>>% gradient_ascent(max_iter = 200)

result_coarse <- strategy(problem_mix, theta_init)

cat("Coarse-to-fine strategy:\n")
cat("  mu1 =", round(result_coarse$theta.hat[1], 2),
    " mu2 =", round(result_coarse$theta.hat[3], 2), "\n")
cat("  Log-likelihood:", round(result_coarse$loglike, 2), "\n")

## ----restarts-----------------------------------------------------------------
# Draw one random, plausible mixture parameter vector c(mu1, s1, mu2, s2, pi1).
# Means span the data's rough extent, sigmas stay comfortably positive, and
# the mixing weight is kept away from the degenerate endpoints 0 and 1.
mixture_sampler <- function() {
  mu1 <- runif(1, -5, 10)
  s1  <- runif(1, 0.5, 3)
  mu2 <- runif(1, -5, 10)
  s2  <- runif(1, 0.5, 3)
  p1  <- runif(1, 0.2, 0.8)
  c(mu1, s1, mu2, s2, p1)
}

# Multi-start: run 20 independent gradient ascents, each from a point drawn
# by mixture_sampler(), and keep the best result to escape local optima.
strategy <- with_restarts(
  gradient_ascent(max_iter = 100),
  n = 20,
  sampler = mixture_sampler
)

result_restarts <- strategy(problem_mix, theta_init)

cat("Random restarts (20 starts):\n")
cat("  mu1 =", round(result_restarts$theta.hat[1], 2),
    " mu2 =", round(result_restarts$theta.hat[3], 2), "\n")
cat("  Log-likelihood:", round(result_restarts$loglike, 2), "\n")
# NOTE(review): assumes with_restarts() records the winning start index under
# solver_info$composition — confirm field names against the package docs.
cat("  Best restart:", result_restarts$solver_info$composition$best_restart, "of", result_restarts$solver_info$composition$n_restarts, "\n")

## ----racing-------------------------------------------------------------------
# Racing: %|% runs the alternative solvers and reports the best outcome;
# result_race$solver_info$name identifies which solver won.
strategy <- gradient_ascent() %|% bfgs() %|% nelder_mead()

result_race <- strategy(problem_mix, theta_init)

cat("Racing strategy:\n")
cat("  Winner:", result_race$solver_info$name, "\n")
cat("  mu1 =", round(result_race$theta.hat[1], 2),
    " mu2 =", round(result_race$theta.hat[3], 2), "\n")
cat("  Log-likelihood:", round(result_race$loglike, 2), "\n")

## ----global-local-------------------------------------------------------------
# Global-then-local: simulated annealing explores broadly, then gradient
# ascent polishes whatever basin the annealer ends up in.
strategy <- sim_anneal(temp_init = 10, cooling_rate = 0.95, max_iter = 500) %>>%
  gradient_ascent(max_iter = 100)

result_global <- strategy(problem_mix, theta_init)

cat("Global + local strategy:\n")
cat("  mu1 =", round(result_global$theta.hat[1], 2),
    " mu2 =", round(result_global$theta.hat[3], 2), "\n")
cat("  Log-likelihood:", round(result_global$loglike, 2), "\n")

## ----diagnose, fig.height=4---------------------------------------------------
# Record everything the solver does: objective values, gradients, and the
# full parameter path.
trace_cfg <- mle_trace(values = TRUE, gradients = TRUE, path = TRUE)

# Concave toy objective with its maximum at (3, 2) and an analytic gradient,
# so the recorded trace is easy to interpret visually.
simple_problem <- mle_problem(
  loglike = function(theta) {
    d <- theta - c(3, 2)
    -sum(d * d)
  },
  score = function(theta) 2 * (c(3, 2) - theta),
  constraint = mle_constraint(support = function(theta) TRUE)
)

result_traced <- gradient_ascent(max_iter = 30)(
  simple_problem, c(-2, -1), trace = trace_cfg
)

# Objective value per iteration, and the path taken through parameter space.
plot(result_traced, which = c("loglike", "path"))

## ----trace-data---------------------------------------------------------------
# The recorded optimization path as a data frame, one row per iteration.
path_df <- optimization_path(result_traced)
head(path_df)

# Successive log-likelihood gains reveal the convergence rate: large early
# steps that shrink as the optimum is approached.
if (nrow(path_df) > 5) {
  gains <- diff(path_df$loglike)
  cat("\nLog-likelihood improvement per iteration:\n")
  print(round(head(gains, 5), 4))
}

## ----benchmark----------------------------------------------------------------
# Test problem: bimodal likelihood with a minor mode near theta = 2 and the
# dominant mode near theta = 7 — easy for a local optimizer to get stuck on
# the wrong peak.
bimodal <- mle_problem(
  loglike = function(theta) {
    log(0.3 * dnorm(theta, 2, 0.5) + 0.7 * dnorm(theta, 7, 0.5))
  },
  constraint = mle_constraint(support = function(theta) TRUE)
)

# Strategies to compare: three plain local solvers, one global+local chain,
# and one multi-start wrapper.
strategies <- list(
  "Gradient Ascent" = gradient_ascent(max_iter = 100),
  "BFGS" = bfgs(),
  "Nelder-Mead" = nelder_mead(),
  "SA + GA" = sim_anneal(max_iter = 200) %>>% gradient_ascent(),
  "Restarts (5)" = with_restarts(gradient_ascent(), n = 5,
                                  sampler = function() runif(1, -5, 15))
)

# Run each strategy from 3 seeded random starts and collect the outcomes.
# Rows are accumulated in a preallocated list and bound once at the end:
# growing a data.frame with rbind() inside a loop copies the whole frame on
# every iteration (quadratic cost).
rows <- vector("list", 3 * length(strategies))
n_ok <- 0L
for (name in names(strategies)) {
  for (run in seq_len(3)) {  # 'run', not 'rep': avoid shadowing base::rep
    # Same seed per run index, so every strategy faces the same start points.
    set.seed(run * 100)
    theta0 <- runif(1, -5, 15)
    result <- tryCatch(
      strategies[[name]](bimodal, theta0),
      # Best-effort benchmark: a failing strategy contributes no row.
      error = function(e) NULL
    )
    if (!is.null(result)) {
      n_ok <- n_ok + 1L
      rows[[n_ok]] <- data.frame(
        Strategy = name,
        LogLike = result$loglike,
        Theta = result$theta.hat[1],
        stringsAsFactors = FALSE
      )
    }
  }
}

# Bind the collected rows; fall back to an empty, correctly-typed frame if
# every run failed, so the dplyr summary below still works.
results <- if (n_ok > 0L) {
  do.call(rbind, rows[seq_len(n_ok)])
} else {
  data.frame(Strategy = character(), LogLike = numeric(), Theta = numeric(),
             stringsAsFactors = FALSE)
}

# Summarize results: mean log-likelihood per strategy across the 3 replicates,
# plus the theta of each strategy's single best run, ranked best-first.
library(dplyr)
results %>%
  group_by(Strategy) %>%
  summarize(
    MeanLogLike = round(mean(LogLike), 2),
    # Theta from the replicate that achieved the highest log-likelihood
    BestTheta = round(Theta[which.max(LogLike)], 2),
    .groups = "drop"
  ) %>%
  arrange(desc(MeanLogLike))

