## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(multiobjectiveMDP)

## -----------------------------------------------------------------------------
set.seed(1234)
no_states <- 2
action_sets <- list(c(1, 2), c(1, 2))
no_objectives <- 2
# Generate a two-state bi-objective MDP with three decision epochs and two actions per state
finite_horizon_MMDP <- generate_rand_MMDP(no_states, action_sets, horizon = 3, no_objectives)
# Inspect the transition probabilities
P <- finite_horizon_MMDP$P
P

# Inspect the rewards
R <- finite_horizon_MMDP$R
R
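
# Hedged sanity check: every transition matrix should be row-stochastic, i.e.
# each row sums to 1. The recursion below assumes P is a (possibly nested)
# list of matrices; that layout is only an assumption based on the printed
# output above.
rows_sum_to_one <- function(x) {
  if (is.matrix(x)) return(all(abs(rowSums(x) - 1) < 1e-8))
  if (is.list(x)) return(all(vapply(x, rows_sum_to_one, logical(1))))
  NA  # unrecognized layout; inspect str(P) instead
}
rows_sum_to_one(P)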

## -----------------------------------------------------------------------------
set.seed(1234)
no_states <- 2
action_sets <- list(c(1, 2), c(1, 2))
no_objectives <- 2
# Generate an infinite-horizon two-state bi-objective MDP with two actions per state
stationary_MMDP <- generate_rand_MMDP(no_states, action_sets, horizon = Inf, no_objectives)

# Inspect the transition probabilities
P <- stationary_MMDP$P
P

# Inspect the rewards
R <- stationary_MMDP$R
R
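
# The generator is not tied to two objectives. As an illustration, the call
# below draws a three-objective stationary model over the same states and
# actions; str() gives a compact view of the resulting reward structure.
three_objective_MMDP <- generate_rand_MMDP(no_states, action_sets, horizon = Inf, 3)
str(three_objective_MMDP$R)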

## -----------------------------------------------------------------------------
set.seed(1234)
no_states <- 2
# The action set for state 1 is {1, 2}; for state 2 it is {1, 2, 3}
action_sets <- list(c(1, 2), c(1, 2, 3))
horizon <- 5
# Draw a random Markov policy: row s holds the actions sampled from state s's
# action set, one per decision epoch (byrow = TRUE keeps each state's draws in
# its own row; without it, column-wise filling would mix the two states' actions)
policy <- matrix(data = c(sample(action_sets[[1]], size = horizon - 1, replace = TRUE),
                          sample(action_sets[[2]], size = horizon - 1, replace = TRUE)),
                 nrow = no_states, ncol = horizon - 1, byrow = TRUE)
policy
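
# Rows index states and columns index decision epochs, so the action the
# policy prescribes for state 2 at epoch 3 is:
policy[2, 3]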

## -----------------------------------------------------------------------------
set.seed(1234)
no_states <- 2
# The action set for state 1 is {1, 2}; for state 2 it is {1, 2, 3}
action_sets <- list(c(1, 2), c(1, 2, 3))
policy <- c(sample(action_sets[[1]], size = 1), sample(action_sets[[2]], size = 1))
policy
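
# A pure stationary policy just picks one action per state, so all such
# policies can be enumerated with base R; here there are 2 * 3 = 6 of them
# (one per row).
expand.grid(state_1 = action_sets[[1]], state_2 = action_sets[[2]])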

## -----------------------------------------------------------------------------
set.seed(1234)
no_states <- 2
action_sets <- list(c(1, 2), c(1, 2, 3))
horizon <- 5
no_objectives <- 2
# Generate a two-state bi-objective MDP with the specified action sets and horizon
MMDP <- generate_rand_MMDP(no_states, action_sets, horizon, no_objectives)
transition_probabilities <- MMDP$P
rewards <- MMDP$R

# Draw a random Markov policy; byrow = TRUE keeps each state's sampled actions
# in its own row, so a state is never assigned an action outside its action set
policy <- matrix(data = c(sample(action_sets[[1]], size = horizon - 1, replace = TRUE),
                          sample(action_sets[[2]], size = horizon - 1, replace = TRUE)),
                 nrow = no_states, ncol = horizon - 1, byrow = TRUE)
policy

# Evaluate the expected total reward of the policy over the five epochs
evaluate_finite_horizon_MMDP_markov_policy(transition_probabilities, rewards, policy)

# What if a discount factor of 0.7 is applied at each epoch?
rho <- .7
evaluate_finite_horizon_MMDP_markov_policy(transition_probabilities, rewards, policy, rho)
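
# Sanity check (assuming the omitted discount argument defaults to no
# discounting, as the first call suggests): rho = 1 should reproduce the
# undiscounted values above.
evaluate_finite_horizon_MMDP_markov_policy(transition_probabilities, rewards, policy, 1)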


## -----------------------------------------------------------------------------
set.seed(1234)
no_states <- 2
action_sets <- list(c(1, 2), c(1, 2))
no_objectives <- 2
# Generate an infinite-horizon two-state bi-objective MDP with two actions per state
stationary_MMDP <- generate_rand_MMDP(no_states, action_sets, horizon = Inf, no_objectives)
# Consider the pure policy that recommends action 2 for state 1 and action 1 for state 2
policy <- c(2, 1)
# Evaluate the policy in the infinite-horizon model generated above for rho = .7
evaluate_discounted_MMDP_pure_policy(stationary_MMDP$P, stationary_MMDP$R, policy, rho = .7)
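
# With two objectives, pure policies need not dominate one another; for
# comparison, evaluate the opposite pure policy under the same discount factor.
evaluate_discounted_MMDP_pure_policy(stationary_MMDP$P, stationary_MMDP$R, c(1, 2), rho = .7)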



## -----------------------------------------------------------------------------
set.seed(1234)
# Set up a bi-objective infinite-horizon MMDP
no_states <- 2
action_sets <- list(c(1, 2), c(1, 2, 3))
no_objectives <- 2
stationary_MMDP <- generate_rand_MMDP(no_states, action_sets, horizon = Inf, no_objectives)
stationary_transition_probabilities <- stationary_MMDP$P
stationary_rewards <- stationary_MMDP$R
rho <- .7

# Use policy iteration to find the efficient pure policies
solution <- solve_discounted_MMDP_policy_iteration(stationary_transition_probabilities,
                                                   stationary_rewards, rho)
solution$policies

# Inspect their expected discounted total rewards
solution$value_functions
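
# The exact layout of solution$policies depends on the package; str() shows it.
# Assuming one policy per row (an assumption about the return value),
# re-evaluating the first efficient policy with
# evaluate_discounted_MMDP_pure_policy() should reproduce its value function:
str(solution)
# evaluate_discounted_MMDP_pure_policy(stationary_transition_probabilities,
#                                      stationary_rewards,
#                                      solution$policies[1, ], rho = rho)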

