{-# LANGUAGE AllowAmbiguousTypes, RankNTypes, UnicodeSyntax, NoMonomorphismRestriction #-} {-# LANGUAGE BangPatterns, FlexibleInstances #-} {-# OPTIONS_HADDOCK prune #-} module Penrose.Optimizer (step) where import Penrose.Util import Penrose.Style import Penrose.GenOptProblem import Numeric.AD import Numeric.AD.Internal.On import Numeric.AD.Internal.Reverse import Numeric.AD.Internal.Sparse import qualified Numeric.LinearAlgebra as L import Debug.Trace import System.Random import System.Console.ANSI import Data.List (foldl') default (Int, Double) ------ Opt types, util functions, and params type ObjFn1 a = forall a . (Autofloat a) => [a] -> a -- used for duf type ObjFn2 a = forall a . (Autofloat a) => [a] -> [a] -> a type GradFn a = forall a . (Autofloat a) => [a] -> [a] ----- Various consts nanSub :: (Autofloat a) => a nanSub = 0 infinity :: Floating a => a infinity = 1/0 -- x/0 == Infinity for any x > 0 (x = 0 -> Nan, x < 0 -> -Infinity) -- all numbers are smaller than infinity except infinity, to which it's equal ----- Hyperparameters weightGrowthFactor :: (Autofloat a) => a -- for EP weight weightGrowthFactor = 10 epsUnconstr :: Floating a => a epsUnconstr = 10 ** (-2) epStop :: Floating a => a -- for EP diff epStop = 10 ** (-3) -- epStop = 10 ** (-5) -- epStop = 60 ** (-3) -- epStop = 10 ** (-1) -- epStop = 0.05 -- Parameters for Armijo-Wolfe line search -- NOTE: must maintain 0 < c1 < c2 < 1 c1 :: Floating a => a c1 = 0.001 -- for Armijo, corresponds to alpha in backtracking line search (see below for explanation) -- smaller c1 = shallower slope = less of a decrease in fn value needed = easier to satisfy -- turn Armijo off: c1 = 0 -- Nocedal p38: "In practice, c1 is chosen to be quite small, say 10−4." c2 :: Floating a => a c2 = 0.9 -- for Wolfe, is the factor decrease needed in derivative value -- new directional derivative value / old DD value <= c2 -- smaller c2 = smaller new derivative value = harder to satisfy -- turn Wolfe off: c1 = 1 (basically backatracking line search only) -- Nocedal p39: "Typical values of c2 are 0.9 when the search direction pk is chosen by a Newton or quasi-Newton method" -- true = force linesearch halt if interval gets too small; false = no forced halt intervalMin = True useLineSearch :: Bool useLineSearch = True useAutodiff :: Bool useAutodiff = True constT :: Floating a => a constT = 0.001 -- debugOpt = True debugOpt = False debugLineSearch = False debugBfgs = False trb :: String -> a -> a trb s x = if debugBfgs then trace "---" $ trace s x else x -- prints in left to right order tro :: String -> a -> a tro s x = if debugOpt then trace "---" $ trace s x else x -- prints in left to right order trl :: Show a => String -> a -> a trl s x = if debugLineSearch then trace "---" $ trace s $ traceShowId x else x -- prints in left to right order ----- Convergence criteria -- convergence criterion for EP -- if you want to use it for UO, needs a different epsilon epStopCond :: (Autofloat a) => [a] -> [a] -> a -> a -> Bool epStopCond x x' fx fx' = tro ("EP: \n||x' - x||: " ++ (show $ norm (x -. x')) ++ "\n|f(x') - f(x)|: " ++ (show $ abs (fx - fx'))) $ (norm (x -. x') <= epStop) || (abs (fx - fx') <= epStop) unconstrainedStopCond :: (Autofloat a) => [a] -> Bool unconstrainedStopCond gradEval = norm gradEval < epsUnconstr --------------------------------------- -- Policies -- stepPolicy :: State -> (Params, PolicyParams) -- stepPolicy s = -- -- Check overall convergence first -- let epStatus = optStatus $ (paramsr s) in -- let pparams = policyParams s in -- case epStatus of -- -- Generate new objective function and replace the optimization and policy params accordingly -- EPConverged -> -- -- TODO: clean up the step incrementing -- let pparams' = pparams { policySteps = 1 + policySteps pparams } in -- let (policyRes, psNew) = (policyFn s) (objFns s) (constrFns s) pparams' in -- See what the policy function wants -- case policyRes of -- Nothing -> (paramsr s, pparams' { policyState = psNew }) -- steps incremented, policy done -- Just newFns -> -- Policy keeps going -- let objFnNew = genObjfn (castTranslation $ transr s) (filter isObjFn newFns) (filter isConstr newFns) (varyingPaths s) -- -- TODO: check that these inputs are right -- -- Change obj function and restart optimization -- pparamsNew = pparams' { policyState = psNew, -- currFns = newFns } -- paramsNew = Params { weight = initWeight, -- optStatus = NewIter, -- overallObjFn = objFnNew, -- bfgsInfo = defaultBfgsParams } -- in tro ("Step policy, EP converged, new params:\n" ++ show (paramsNew, pparamsNew, newFns)) $ (paramsNew, pparamsNew) -- -- If not converged, optimize as usual, don't change policy mid-optimization -- _ -> tro ("Step policy, EP not converged, new params:\n" ++ show (paramsr s, pparams)) $ (paramsr s, pparams) --------------------------------------- -- Main optimization functions step :: State -> State step s = let (state', params') = stepShapes s -- (oConfig s) (paramsr s) (varyingState s) (rng s) s' = s { varyingState = state', paramsr = params' } -- NOTE: we intentionally discard the random generator here because -- we want to have consistent computation output in a single -- optimization session -- For the same reason, all subsequent step* functions such as -- stepShapes do not return the new random generator (!shapes', _, _) = evalTranslation s' -- Check the state and see if the overall objective function should be changed -- The policy may change EPConverged to a new iteration before the frontend sees it -- (paramsNew, pparamsNew) = stepPolicy s' -- For debugging oldParams = paramsr s in tro ("Params: \n" ++ show oldParams ++ "\n:") $ s' { -- shapesr = shapes', -- paramsr = paramsNew, -- policyParams = pparamsNew } shapesr = shapes', paramsr = params' } -- note: trans is not updated in state -- Note use of realToFrac to generalize type variables (on the weight and on the varying state) -- implements exterior point algo as described on page 6 here: -- https://www.me.utexas.edu/~jensen/ORMM/supplements/units/nlp_methods/const_opt.pdf stepShapes :: State -> ([Double], Params) stepShapes s = -- varying state -- if null vstate then error "empty state in stepshapes" else let (epWeight, epStatus) = (weight params, optStatus params) in case epStatus of -- start the outer EP optimization and the inner unconstrained optimization, recording initial EPstate NewIter -> let status' = UnconstrainedRunning (map realToFrac vstate) in (vstate', params { weight = initWeight, optStatus = status', bfgsInfo = defaultBfgsParams } ) -- check *weak* convergence of inner unconstrained opt. -- if UO converged, set opt state to converged and update UO state (NOT EP state) -- if not, keep running UO (inner state implicitly stored) -- note convergence checks are only on the varying part of the state UnconstrainedRunning lastEPstate -> -- doesn't use last EP state -- let unconstrConverged = unconstrainedStopCond gradEval in let unconstrConverged = epStopCond vstate vstate' (objFnApplied vstate) (objFnApplied vstate') in -- Two stopping conditions -- unconstrainedStopCond gradEval in if unconstrConverged then let status' = UnconstrainedConverged lastEPstate in -- update UO state only! (vstate', params { optStatus = status', bfgsInfo = defaultBfgsParams }) -- note vstate' (UO converged), not vstate else (vstate', params { bfgsInfo = bfgs' }) -- update UO state but not EP state; UO still running -- check EP convergence. if converged then stop, else increase weight, update states, and run UO again -- TODO some trickiness about whether unconstrained-converged has updated the correct state -- and whether to check WRT the updated state or not UnconstrainedConverged lastEPstate -> let epConverged = epStopCond lastEPstate (map r2f vstate) -- lastEPstate is last state for converged UO (objFnApplied lastEPstate) (objFnApplied (map r2f vstate)) in if epConverged then let status' = EPConverged in -- no more EP state (vstate, params { optStatus = status', bfgsInfo = defaultBfgsParams }) -- do not update UO state -- update EP state: to be the converged state from the most recent UO else let status' = UnconstrainedRunning (map realToFrac vstate) in -- increase weight let epWeight' = weightGrowthFactor * epWeight in -- trace ("Unconstrained converged. New weight: " ++ show epWeight') $ (vstate, params { weight = epWeight', optStatus = status', bfgsInfo = defaultBfgsParams }) -- done; don't update obj state or params; user can now manipulate EPConverged -> (vstate, params { bfgsInfo = defaultBfgsParams } ) -- TODO: implement EPConvergedOverride (for when the magnitude of the gradient is still large) -- TODO factor out--only unconstrainedRunning needs to run stepObjective, but EPconverged needs objfn where (config, params, vstate, g) = (oConfig s, paramsr s, varyingState s, rng s) (vstate', gradEval, bfgs') = stepWithObjective s -- objFnApplied = (overallObjFn params) g (r2f $ weight params) objFnApplied = evalEnergyOn s -- Given the time, state, and evaluated gradient (or other search direction) at the point, -- return the new state stepT :: Double -> Double -> Double -> Double stepT dt x dfdx = x - dt * dfdx -- Calculates the new state by calculating the directional derivatives (via autodiff) -- and timestep (via line search), then using them to step the current state. -- Also partially applies the objective function. stepWithObjective :: State -> ([Double], [Double], BfgsParams) stepWithObjective s = -- get timestep via line search, and evaluated gradient at the state let (t', gradEval, gradToUse, bfgs') = timeAndGrad config params objFnApplied state -- step each parameter of the state with the time and gradient state' = map (\(v, dfdv) -> stepT t' v dfdv) (zip state $ gradToUse) (fx, fx') = (objFnApplied state, objFnApplied state') in -- if fx' > fx then error ("Error: new energy is greater than old energy: " ++ show (fx', fx)) else tro ("\n----------------------------------------\n" ++ "\nopt params: \n" ++ (show params) ++ "\n||x' - x||: " ++ (show $ norm (state -. state')) ++ "\n|f(x') - f(x)|: " ++ (show $ abs (fx' - fx)) ++ "\nf(x'): \n" ++ (show fx') ++ "\ngradEval: \n" ++ (show gradEval) ++ "\n||gradEval||: \n" ++ (show $ norm gradEval) ++ "\ngradToUse: \n" ++ (show gradToUse) ++ "\n||gradToUse||: \n" ++ (show $ norm gradToUse) -- ++ "\nhessian: \n" ++ (show $ h) ++ "\nbfgs': \n" ++ (show bfgs') -- TODO: use trb ++ "\n timestep: \n" ++ (show t') ++ "\n original state: \n" ++ (show state) ++ "\n new state: \n" ++ (show state') ) (state', gradEval, bfgs') where (config, params, state, g) = (oConfig s, paramsr s, varyingState s, rng s) -- objFnApplied :: ObjFn1 b -- objFnApplied = (overallObjFn params) g cWeight objFnApplied = evalEnergyOn s cWeight = r2f $ weight params -- realToFrac generalizes the type variable `a` to the type variable `b`, which timeAndGrad expects -- a version of grad with a clearer type signature appGrad :: (Autofloat a) => (forall b . (Autofloat b) => [b] -> b) -> [a] -> [a] appGrad f l = grad f l instance Show (Numeric.AD.Internal.On.On (Numeric.AD.Internal.Reverse.Reverse s (Numeric.AD.Internal.Sparse.Sparse a))) where show a = "error: not sure how to derive show for hessian element" appHess :: (Autofloat a) => (forall b . (Autofloat b) => [b] -> b) -> [a] -> [[a]] appHess f l = hessian f l -- Precondition the gradient gradP :: OptConfig -> BfgsParams -> [Double] -> ObjFn1 a -> [Double] -> ([Double], BfgsParams) gradP config bfgsParams gradEval f state = let x_k = L.vector $ map r2f state grad_fx_k = L.vector gradEval in case optMethod config of GradientDescent -> (gradEval, bfgsParams) Newton -> -- Precondition gradient with the pseudoinverse of the hessian let h = appHess f state h_list = (map r2f $ concat h) :: [Double] hinv = L.pinv $ L.matrix (length gradEval) $ h_list gradPreconditioned = hinv L.#> (L.vector gradEval) in (L.toList gradPreconditioned, bfgsParams) BFGS -> -- Approximate inverse of hessian with the change in gradient (see Nocedal S9.1 p224) let grad_val = lastGrad bfgsParams :: Maybe [Double] h_val = invH bfgsParams :: Maybe [[Double]] state_val = lastState bfgsParams :: Maybe [Double] in case (h_val, grad_val, state_val) of (Nothing, Nothing, Nothing) -> -- First step. Initialize the approximation to the identity (not clear how else to approximate it) -- k=0 steps from x_0 to x_1: so on the first step, we take a normal gradient descent step let h_0 = L.ident $ length gradEval in (gradEval, bfgsParams { lastState = Just $ L.toList x_k, lastGrad = Just $ L.toList grad_fx_k, invH = Just $ L.toLists h_0 }) (Just h_km1L, Just grad_fx_km1L, Just x_km1L) -> -- For x_{k+1}, to compute H_k, we need the (k-1) info -- Our convention is that we are always working "at" k to compute k+1 -- x_0 doesn't require any H; x_1 (the first step) with k = 0 requires H_0 -- x_2 (the NEXT step) with k=1 requires H_1. For example> -- x_2 = x_1 - alpha_1 H_1 grad f(x_1) [GD step] -- H_1 = V_0 H_0 V_0 + rho_0 s_0 s_0^T [This is confusing because the book adds an extra +1 to the H index] -- V_0 = I - rho_0 y_0 s_0^T -- rho_0 = 1 / y_0^T s_0 -- s_0 = x_1 - x_0 -- y_0 = grad f(x_1) - grad f(x_0) let (h_km1, grad_fx_km1, x_km1) = (L.fromLists h_km1L, L.vector grad_fx_km1L, L.vector x_km1L) in let s_km1 = x_k - x_km1 y_km1 = grad_fx_k - grad_fx_km1 rho_km1 = 1 / (y_km1 `L.dot` s_km1) -- Scalar v_km1 = L.ident (length gradEval) - (rho_km1 `L.scale` y_km1 `L.outer` s_km1) -- Scaling can happen before outer h_k = (L.tr (v_km1) L.<> h_km1 L.<> v_km1) + (rho_km1 `L.scale` s_km1 `L.outer` s_km1) gradPreconditioned = h_k L.#> grad_fx_k in (L.toList gradPreconditioned, bfgsParams { lastState = Just $ L.toList x_k, lastGrad = Just $ L.toList grad_fx_k, invH = Just $ L.toLists h_k }) _ -> error "invalid BFGS state" LBFGS -> -- Approximate the inverse of the Hessian times the gradient -- Only using the last `m` gradient/state difference vectors, not building the full h_k matrix (Nocedal p226) let grad_prev = lastGrad bfgsParams x_prev = lastState bfgsParams ss_val = s_list bfgsParams ys_val = y_list bfgsParams km1 = numUnconstrSteps bfgsParams -- Our current step is k; the last step is km1 (k_minus_1) m = memSize bfgsParams in case (grad_prev, x_prev, ss_val, ys_val, km1) of -- Perform normal gradient descent on first step (Nothing, Nothing, [], [], 0) -> -- Store x_k, grad f(x_k) so we can compute s_k, y_k on next step let bfgsParams' = bfgsParams { lastState = Just $ L.toList x_k, lastGrad = Just $ L.toList grad_fx_k, s_list = [], y_list = [], numUnconstrSteps = km1 + 1 } in (gradEval, bfgsParams') (Just grad_fx_km1L, Just x_km1L, ssL, ysL, km1) -> let (grad_fx_km1, x_km1, ss, ys) = (L.fromList grad_fx_km1L, L.fromList x_km1L, map L.fromList ssL, map L.fromList ysL) in -- Compute s_{k-1} = x_k - x_{k-1} and y_{k-1} = (analogous with grads) -- Unlike Nocedal, compute the difference vectors first instead of last (same result, just a loop rewrite) -- Use the updated {s_i} and {y_i}. (If k < m, this reduces to normal BFGS, i.e. we use all the vectors so far) let (s_km1, y_km1) = (x_k - x_km1, grad_fx_k - grad_fx_km1) -- Newest vectors added to front (ss', ys') = (take m $ s_km1 : ss, take m $ y_km1 : ys) -- The limited-memory part: drop stale vectors gradPreconditioned = lbfgs grad_fx_k ss' ys' descentDirCheck = -gradPreconditioned `L.dot` grad_fx_k -- Reset L-BFGS if the result is not a descent direction, and use steepest descent direction -- https://github.com/JuliaNLSolvers/Optim.jl/issues/143 https://github.com/JuliaNLSolvers/Optim.jl/pull/144 in if descentDirCheck < epsd then (L.toList gradPreconditioned, bfgsParams { lastState = Just $ L.toList x_k, lastGrad = Just $ L.toList grad_fx_k, s_list = map L.toList ss', y_list = map L.toList ys', numUnconstrSteps = km1 + 1 }) else tro "L-BFGS did not find a descent direction. Resetting correction vectors." $ (gradEval, bfgsParams { lastState = Just $ L.toList x_k, lastGrad = Just $ L.toList grad_fx_k, s_list = [], y_list = [], numUnconstrSteps = km1 + 1 }) -- TODO: check the curvature condition y_k^T s_k > 0 (8.7) (Nocedal 201) -- https://github.com/JuliaNLSolvers/Optim.jl/issues/26 _ -> error "invalid L-BFGS state" type LVector = L.Vector L.R type LMatrix = L.Matrix L.R -- See Nocedal p225 -- expects ys and ss to be in order from most recent to oldest (k-1 ... k-m) -- expects length ys == length ss, length ys > 0, length ys > 0 lbfgs :: LVector -> [LVector] -> [LVector] -> LVector lbfgs grad_fx_k ss ys = let rhos = map calculate_rho $ zip ss ys -- The length of any list should be the number of stored vectors q_k = grad_fx_k (q_k_minus_m, alphas) = foldl' pull_q_back (q_k, []) (zip3 rhos ss ys) -- backward: for i = k-1 ... k-m -- Note the order of alphas will be from k-m through k-1 for the push_r_forward loop h_0_k = estimate_hess (head ys) (head ss) r_k_minus_m = h_0_k L.#> q_k_minus_m r_k = foldl' push_r_forward r_k_minus_m (zip (zip (reverse rhos) alphas) (zip (reverse ss) (reverse ys))) -- forward: for i = k-m .. k-1 (TODO: optimize out the reverses) in r_k -- is H_k * grad f(x_k) where calculate_rho :: (LVector, LVector) -> L.R calculate_rho (s, y) = 1 / ((y `L.dot` s) + epsd) pull_q_back :: (LVector, [L.R]) -> (L.R, LVector, LVector) -> (LVector, [L.R]) -- from i+1 to i pull_q_back (q_i_plus_1, alphas) (rho_i, s_i, y_i) = let alpha_i = rho_i * (s_i `L.dot` q_i_plus_1) -- scalar q_i = q_i_plus_1 - (alpha_i `L.scale` y_i) -- scalar * vector in (q_i, alpha_i : alphas) -- Note the order of alphas -- Scale I by an estimate of the size of the Hessian along the most recent search direction (Nocedal p226) estimate_hess :: LVector -> LVector -> LMatrix estimate_hess y_km1 s_km1 = let gamma_k = (s_km1 `L.dot` y_km1) / ((y_km1 `L.dot` y_km1) + epsd) in gamma_k `L.scale` (L.ident (L.size y_km1)) push_r_forward :: LVector -> ((L.R, L.R), (LVector, LVector)) -> LVector -- from i to i+1 push_r_forward r_i ((rho_i, alpha_i), (s_i, y_i)) = let beta_i = rho_i * (y_i `L.dot` r_i) -- scalar r_i_plus_1 = r_i + (alpha_i - beta_i) `L.scale` s_i -- scalar * vector in r_i_plus_1 estimateGradient :: ObjFn1 a -> [Double] -> [Double] estimateGradient f state = let len = length state h = 0.001 -- Choice of h really matters!!! This is not an accurate estimate. fx = f state -- time is O(|state|^2) dfx i = ((f $ replace i ((state !! i) + h) state) - fx) / h dfxs = map dfx [0..(len-1)] in dfxs where replace pos newVal list = take pos list ++ newVal : drop (pos+1) list -- Given the objective function, gradient function, timestep, and current state, -- return the timestep (found via line search) and evaluated gradient at the current state. -- the autodiff library requires that objective functions be polymorphic with Floating a timeAndGrad :: OptConfig -> Params -> ObjFn1 a -> [Double] -> (Double, [Double], [Double], BfgsParams) timeAndGrad config params f state = let gradEval = if useAutodiff then gradF state else estimateGradient f state (gradToUse_d, bfgs') = gradP config (bfgsInfo params) (map r2f gradEval :: [Double]) f state gradToUse = map r2f gradToUse_d -- Use line search to find a good timestep. If we use Newton's method, the descent direction uses the preconditioned gradient. descentDir = negL $ gradToUse timestep = let resT = if useLineSearch && useAutodiff then awLineSearch config f (duf descentDir) descentDir state else constT in -- hardcoded timestep if isNaN resT then error "returned timestep is NaN" else resT in tr "timeAndGrad: " (timestep, gradEval, gradToUse, bfgs') where gradF :: GradFn a gradF = appGrad f -- directional derivative of f at x in the direction of u (descent direction, which may not have unit norm) duf :: (Autofloat a) => [a] -> [a] -> a duf u x = u `dotL` gradF x linesearch_max :: Int linesearch_max = 100 -- TODO what's a reasonable limit (if any)? -- Implements Armijo-Wolfe line search as specified in Keenan's notes, converges on nonconvex fns as well -- based off Lewis & Overton, "Nonsmooth optimization via quasi-Newton methods -- duf = D_u(f), the directional derivative of f at descent direction u -- D_u(x) = . If u = -gradF(x) (as it is here), then D_u(x) = -||gradF(x)||^2 awLineSearch :: OptConfig -> ObjFn1 a -> ObjFn1 a -> [Double] -> [Double] -> Double awLineSearch config f duf descentDir x0 = let (a0, b0, t0) = (0, infinity, 1) -- Unit step length should be tried first for quasi-Newton methods. (Nocedal 201) -- drop while a&w are not satisfied OR the interval is large enough (numUpdates, (af, bf, tf)) = head $ dropWhile intervalOK_or_notArmijoAndWolfe $ zip [0..] $ iterate update (a0, b0, t0) in tro ("Line search # updates: " ++ show numUpdates) $ tf where update :: (Double, Double, Double) -> (Double, Double, Double) update (a, b, t) = let (a', b', sat) | not $ armijo t = trl "not armijo" (a, t, False) | not $ wolfe t = trl "not wolfe" (t, b, False) | otherwise = (a, b, True) in if sat then (a, b, t) -- if armijo and wolfe, then we use (a, b, t) as-is else if b' < infinity then trl "b' < infinity" (a', b', (a' + b') / 2) else trl "b' = infinity" (a', b', 2 * a') intervalOK_or_notArmijoAndWolfe :: (Int, (Double, Double, Double)) -> Bool intervalOK_or_notArmijoAndWolfe (numUpdates, (a, b, t)) = not $ if armijo t && wolfe t then trl ("stop: both sat. |descentDir at x0| = " ++ show (norm descentDir)) True else if abs (b - a) < minInterval then trl ("stop: interval too small. |descentDir at x0| = " ++ show (norm descentDir)) True else if numUpdates > linesearch_max then trl ("stop: number of line search updates exceeded max") True else False armijo :: Double -> Bool armijo t = (f (x0 +. t *. descentDir)) <= (fAtx0 + c1 * t * dufAtx0) weakWolfe :: Double -> Bool -- Better for nonsmooth functions weakWolfe t = (duf (x0 +. t *. descentDir)) >= (c2 * dufAtx0) strongWolfe :: Double -> Bool strongWolfe t = (abs (duf (x0 +. t *. descentDir))) <= (c2 * abs dufAtx0) -- Descent direction (at x0) must have a negative dot product with the gradient of f at x0. dufAtx0 :: Double -- TODO: this is redundant, cache it dufAtx0 = let res = duf x0 in trl (": " ++ show res) $ if res > 0 then tro "WARNING: descent direction doesn't satisfy condition" res else res fAtx0 :: Double fAtx0 = f x0 minInterval :: Double -- stop if the interval gets too small; might not terminate minInterval = if intervalMin then 10 ** (-10) else 0 -- TODO: the line search is very sensitive to this parameter. Blows up with 10**(-5). Why? wolfe :: Double -> Bool wolfe = weakWolfe