-- | This module provides several flavors of the value iteration
-- algorithm for solving MDPs.
module Algorithms.MDP.ValueIteration
( -- * Value iteration algorithms
valueIteration
, relativeValueIteration
, undiscountedRelativeValueIteration
-- * Helper functions for value iteration
, valueIterate
, relativeValueIterate
, undiscountedRVI
) where
import qualified Data.Vector as V
import Algorithms.MDP
-- | Compute the inner product between two vectors.
inner :: (Num t) => V.Vector t -> V.Vector t -> t
inner u v = V.sum (V.zipWith (*) u v)
-- | Compute an infinite sequence of estimates of cost functions
-- converging to the true cost function.
--
-- This method should only be used on discounted MDPs (e.g. an MDP
-- with a discount factor less than one).
valueIteration ::
(Ord t, Num t) =>
MDP a b t -- ^ The MDP to solve
-> [CF a b t] -- ^ An converging sequence of cost functions
valueIteration mdp =
let
states = _states mdp
actions = _actions mdp
zero = V.map (\s -> (s, V.head actions, 0)) states
in
iterate (valueIterate mdp) zero
-- | Computes the next estimate of the cost function.
valueIterate :: (Ord t, Num t) =>
MDP a b t -- ^ The MDP to solve
-> CF a b t -- ^ The current cost function estimate
-> CF a b t -- ^ The next cost function estimate
valueIterate mdp cf = V.imap (choiceFor mdp cf) (_states mdp)
-- | Finds the action that minimizes the one-step payoff using the
-- given cost function.
choiceFor :: (Ord t, Num t) =>
MDP a b t -- ^ The MDP we are solving
-> CF a b t -- ^ The current cost function
-> Int -- ^ The state for which we choose an action
-> a -- ^ The state for which we choose an action
-> (a, b, t) -- ^ The choice of action and associated cost
choiceFor mdp cf sIndex s =
let
actions = V.fromList [(_actions mdp) V.! ac' | ac' <- V.toList ((_actionSet mdp) V.! sIndex)]
cmp (_, x) (_, y) = compare x y
costs = V.map (costForAction mdp cf sIndex) (_actionSet mdp V.! sIndex)
pairs = V.zip actions costs
(ac, c) = V.minimumBy cmp pairs
in
(s, ac, c)
-- | Computes the cost implied by choosing an action in the given
-- state.
costForAction :: (Num t) =>
MDP a b t -- ^ The MDP we are solving.
-> CF a b t -- ^ The current cost function.
-> Int -- ^ The index of the state.
-> Int -- ^ The index of the action.
-> t -- ^ The estimated cost.
costForAction mdp cf sIndex ac =
let
alpha = _discount mdp
fixedCost = (_costs mdp) V.! ac V.! sIndex
transCost = inner (_trans mdp V.! ac V.! sIndex) (V.map (\(_, _, c) -> c) cf)
in
fixedCost + alpha * transCost
-- | An implementation of value iteration that computes monotonic
-- error bounds.
--
-- The error bounds provided at each iteration are additive in each
-- state. That is, given a cost estimate 'c' for a given state and
-- lower and upper bounds 'lb' and 'ub', the true cost is guaranteed
-- to be in the interval [c + lb, c + ub].
relativeValueIteration ::
(Read t, Ord t, Fractional t) =>
MDP a b t -- ^ The MDP to solve
-> [CFBounds a b t] -- ^ A converging sequence of cost functions.
relativeValueIteration mdp =
let
states = _states mdp
actions = _actions mdp
zero = V.map (\s -> (s, V.head actions, 0)) states
cf = CFBounds zero (read "-Infinity") (read "Infinity")
in
iterate (relativeValueIterate mdp) cf
-- | Computes the next estimate of the cost function and associated
-- error bounds.
relativeValueIterate ::
(Ord t, Fractional t) =>
MDP a b t
-> CFBounds a b t
-> CFBounds a b t
relativeValueIterate mdp (CFBounds cf _ _) =
let
alpha = _discount mdp
cf' = valueIterate mdp cf
(lb, ub) = (V.minimum diffs, V.maximum diffs)
where
diffs = V.zipWith (\(_, _, a) (_, _, b) -> a - b) cf' cf
scale = alpha / (1 - alpha)
in
CFBounds
{ _CF = cf'
, _lb = scale * lb
, _ub = scale * ub
}
-- | Relative value iteration for undiscounted MDPs.
undiscountedRelativeValueIteration ::
(Ord t, Fractional t, Read t) =>
MDP a b t -- ^ The MDP to solve
-> [CFBounds a b t] -- ^ A converging sequence of cost functions
undiscountedRelativeValueIteration mdp =
let
states = _states mdp
actions = _actions mdp
trans = _trans mdp
update s v = V.imap (\i z -> tau * z + if i == s then (1 - tau) else 0) v
trans' = V.map (\vv -> V.imap (\s v -> update s v) vv) trans
tau = 0.5
mdp' = mdp {_trans = trans'}
zeroV = V.map (\s -> (s, V.head actions, 0)) states
zero = CFBounds zeroV (read "-Infinity") (read "Infinity")
distinguished = 0
in
iterate (undiscountedRVI mdp' distinguished) zero
-- | Performs a single iterate of relative value iteration for the
-- undiscounted problem.
undiscountedRVI :: (Ord t, Fractional t) =>
MDP a b t
-> Int
-> CFBounds a b t
-> CFBounds a b t
undiscountedRVI mdp distinguished (CFBounds h _ _) =
let
th = valueIterate mdp h
(_, _, distinguishedCost) = th V.! distinguished
th' = V.map (\(s, ac, z) -> (s, ac, z - distinguishedCost)) th
(lb, ub) = (V.minimum diffs, V.maximum diffs)
where
diffs = V.zipWith (\(_, _, a) (_, _, b) -> a - b) th h
in
CFBounds th' lb ub